Synopsis

This is a comparison of the phonetic differentiation of vowels in bilingual speakers of Basque and Castilian. A total of 38 individuals participated in this study, of whom 89.47% were female and 10.53% were male. All speakers were bilingual; however, their L1 and L2 are unknown, because the original researchers shared no further information to accompany this data set. It might be assumed that the L1 was Basque, as the vocabulary used in the study is typical of the Basque language. Participants were asked to pronounce a list of 10 words (bata, batu, beta, bete, bita, bitak, bota, botu, puta, putak), each of which was produced a total of five times. These are word pairs for the vowels a, e, i, o, and u, which are typical of both Basque and Castilian.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vowels)
library(glue)

####################### R e a d    D a t a     F r a m e #######################
# The first line of the CSV is a header row. It is consumed as a header here
# (instead of being read as data and dropped afterwards), so the raw data
# frame no longer carries a bogus first row and row names start at 1.
# `col.names` replaces the file's own header with the names used throughout
# this script. Every column is read as character so that the explicit numeric
# conversion below behaves as before: non-numeric entries become NA with a
# warning rather than aborting the read.
# NOTE: the path is absolute and machine-specific; adjust it when running
# this script elsewhere.
vowel_data_Basque_Castillian <- read.csv(
  "C:/Users/jgarc/OneDrive/Desktop/APLN_563/APLN563_Final_Project/vowel_data (1).csv",
  header = TRUE,
  col.names = c(
    "id", "language", "gender", "vowel",
    "F1_mid", "F2_mid", "F3_mid", "words"
  ),
  colClasses = "character"
)

####################### E n s u r e     N u m e r i c     C o n v e r s i o n ##
# The three formant columns are converted from character to numeric; all
# other columns stay character.
df <- vowel_data_Basque_Castillian %>%
  mutate(across(c(F1_mid, F2_mid, F3_mid), as.numeric))

####################### D a t a   F r a m e   S t a t i s t i c s ##############
# Pooled mean and standard deviation of each formant column (F1-F3) over all
# rows of `df`. Missing values are ignored. The result is a one-row data frame.
df_summary <- with(
  df,
  data.frame(
    m_f1 = mean(F1_mid, na.rm = TRUE),
    sd_f1 = sd(F1_mid, na.rm = TRUE),
    m_f2 = mean(F2_mid, na.rm = TRUE),
    sd_f2 = sd(F2_mid, na.rm = TRUE),
    m_f3 = mean(F3_mid, na.rm = TRUE),
    sd_f3 = sd(F3_mid, na.rm = TRUE)
  )
)

print(df_summary)
##       m_f1    sd_f1     m_f2    sd_f2     m_f3    sd_f3
## 1 506.1294 157.1423 1638.473 658.5282 2923.521 252.3125
###################### P a r t i c i p a n t   C o u n t s ###################

# Number of distinct speaker ids among the rows of `df` whose gender column
# equals `gender_code`.
count_speakers <- function(gender_code) {
  df %>%
    filter(gender == gender_code) %>%
    distinct(id) %>%
    nrow()
}

# Total number of distinct speakers.
n_distinct_id <- length(unique(df$id))

# Female speakers: count and share of all speakers (percent, 2 decimals).
count_female <- count_speakers("f")
percent_female <- round(count_female / n_distinct_id * 100, 2)

# Male speakers: count and share of all speakers (percent, 2 decimals).
count_male <- count_speakers("m")
percent_male <- round(count_male / n_distinct_id * 100, 2)

# Distinct words in the data: how many there are, and the sorted list.
n_distinct_words <- length(unique(df$words))
sorted_distinct_words <- df$words %>%
  unique() %>%
  sort()


####################### A d d    T i b b l e #######################
# Collect the participant and word counts computed above into a single
# one-row tibble and print it. This replaces the earlier `df_summary` object.
participant_overview <- list(
  N = n_distinct_id,
  Male = count_male,
  Percent_Male = percent_male,
  Female = count_female,
  Percent_Female = percent_female,
  Word_Count = n_distinct_words
)
df_summary <- as_tibble(participant_overview)
print(df_summary)
## # A tibble: 1 × 6
##       N  Male Percent_Male Female Percent_Female Word_Count
##   <int> <int>        <dbl>  <int>          <dbl>      <int>
## 1    38     4         10.5     34           89.5         10
####################### S p l i t     B y     L a n g u a g e ##################
# One data frame per value of the `language` column, used for the
# per-language vowel-space plots.
df_basque <- filter(df, language == "basque")
df_spanish <- filter(df, language == "spanish")



####################### P l o t     F u n c t i o n #######################
#' Plot an F1-F2 vowel space for one language
#'
#' Draws every token as its vowel letter at (F2_mid, F1_mid), adds one 0.95
#' level normal ellipse per vowel, and facets the plot by gender. Both axes
#' are reversed.
#'
#' @param data A data frame with the columns `F1_mid`, `F2_mid`, `vowel` and
#'   `gender`.
#' @param lang_label A string prepended to the plot title.
#' @return A ggplot object.
plot_vowel_space <- function(data, lang_label) {
  ggplot(data, aes(x = F2_mid, y = F1_mid, color = vowel)) +
    # Tokens are shown as text labels (below) rather than as point markers.
    stat_ellipse(type = "norm", level = 0.95) +
    geom_text(aes(label = vowel), vjust = -0.5, size = 3, show.legend = FALSE) +
    scale_x_reverse() +
    # Fixed F1 range so that plots for different languages share one scale.
    scale_y_reverse(limits = c(1200, 200), breaks = seq(200, 1200, 200)) +
    facet_wrap(~gender) +
    labs(
      x = "F2 (Hz)",
      y = "F1 (Hz)",
      title = paste(lang_label, "Vowel Space (F1–F2)"),
      # Spelling fixed: was "Elipse".
      subtitle = "Ellipse = 95% confidence"
    ) +
    theme_minimal()
}



####################### G e n e r a t e     P l o t s #######################

# Vowel space of the Basque productions.
plot_vowel_space(data = df_basque, lang_label = "Basque")

# Vowel space of the Spanish (Castilian) productions.
plot_vowel_space(data = df_spanish, lang_label = "Spanish")

####################### B a s q u e   vs.   S p a n i s h  C o m p a r i s o n #
# Both languages in one F1-F2 plot: colour encodes language, point shape
# encodes vowel, and panels are split by gender. Ellipses are drawn at the
# 0.95 level for each colour/shape group.
ggplot(df, aes(x = F2_mid, y = F1_mid, color = language, shape = vowel)) +
  geom_point(alpha = 0.6, size = 2) +
  stat_ellipse(type = "norm", level = 0.95) +
  scale_x_reverse() +
  scale_y_reverse(limits = c(1200, 200), breaks = seq(200, 1200, 200)) +
  facet_wrap(~gender) +
  labs(
    x = "F2 (Hz)",
    y = "F1 (Hz)",
    title = "Basque vs. Spanish Vowel Space",
    # Spelling fixed: was "Elipse".
    subtitle = "Ellipse = 95% confidence"
  ) +
  theme_minimal()

# Mean and standard deviation of F1 and F2 for every combination of
# language, gender and vowel (missing values ignored). The result is
# ungrouped.
tokens_by_cell <- group_by(df, language, gender, vowel)
df_grouped <- summarise(
  tokens_by_cell,
  m_f1 = mean(F1_mid, na.rm = TRUE),
  sd_f1 = sd(F1_mid, na.rm = TRUE),
  m_f2 = mean(F2_mid, na.rm = TRUE),
  sd_f2 = sd(F2_mid, na.rm = TRUE),
  .groups = "drop"
)

# Mean vowel positions per language, gender and vowel, with error bars of
# one standard deviation on either side of the mean in both formant
# dimensions.
ggplot(df_grouped, aes(x = m_f2, y = m_f1, color = language, shape = vowel)) +
  geom_point(size = 3) +
  # Vertical error bars (F1 variation)
  geom_errorbar(
    aes(ymin = m_f1 - sd_f1, ymax = m_f1 + sd_f1),
    width = 0.1
  ) +
  # Horizontal error bars (F2 variation). geom_errorbarh() is deprecated as
  # of ggplot2 4.0.0; the replacement is geom_errorbar() with
  # orientation = "y", where the former `height` is given as `width`.
  geom_errorbar(
    aes(xmin = m_f2 - sd_f2, xmax = m_f2 + sd_f2),
    orientation = "y",
    width = 0.1
  ) +
  scale_x_reverse() +
  scale_y_reverse(limits = c(1200, 200), breaks = seq(200, 1200, 200)) +
  facet_wrap(~gender) +
  labs(
    x = "F2 (Hz)", y = "F1 (Hz)",
    title = "Basque vs. Spanish Vowel Space with SD"
  ) +
  theme_minimal()
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.

Analysis

Based on the data collected, vowel production in both male and female participants seems to fall within a similar F1 range regardless of the language spoken (Basque vs. Castilian); however, it does seem that Basque vowels are slightly more fronted, hence their lower F2. The F1 range for male participants does seem to vary slightly more than for female participants when comparing across languages; however, this variance is still within the standard deviation. It is interesting to note that this occurred for all vowels except "i". Most participants in the study were female, so female speakers are disproportionately represented in the data; nevertheless, most participants fall within similar standard deviation ranges when considering vowel production by gender across the languages observed.

Considerations

There was no control group for this study, or at least no control group has been referenced in any of the materials uploaded to OSF (see Citations for the database URL) by the original researchers.

Citations

Li, P. (2025, November 28). Same Vowels, Distinct Sounds: Phonetic Differentiation in Early Basque-Spanish Bilinguals’ Vowel Production. Retrieved from osf.io/mxjpd