Step1: Data Cleaning for Individuals

First, check the working directory with getwd(). If it has not been set yet, set it with setwd().

# Print the current working directory so it can be checked before any file is read.
getwd()

## [1] "/Users/saurabhkumarnath/Desktop/Transcription/FastTrack"

# Switch the session to the project folder. This is an absolute path on the
# author's machine; edit it before running the script anywhere else.
setwd("/Users/saurabhkumarnath/Desktop/Transcription/FastTrack")

Now, load all the libraries needed for the project.

library(dplyr)
library(data.table)
library(ggplot2)
library(vowels)
library(ggpubr)
library(phonR)
library(phonTools)

Let’s import the required files from FastTrack. There are two ways: use file.choose() to pick each file interactively, or paste the file path. FastTrack output files are usually broken; correct them manually before importing: file, output file, & vowel A.

# Interactive alternative: choose each CSV in a file dialog instead of
# hard-coding its path.
#import1 <- read.csv(file.choose())
#import2 <- read.csv(file.choose())

# import1: the aggregated data file; import2: the speaker's segmentation info
# file (both named as in their paths).
import1 <- read.csv(
  file = "/Users/saurabhkumarnath/Desktop/kamrup/kusum/processed_data/aggregated_data.csv"
)
import2 <- read.csv(
  file = "/Users/saurabhkumarnath/Desktop/kamrup/kusum/KamrupHajoKusumchoudhuryF01H6_segmentation_info.csv"
)

Run the following to see the files before proceeding to merge.

#View(import1)
#View(import2)

Now, merge the files. First check that the two key columns, file (in import1) and outputfile (in import2), are present in the imported files.

# Stop early if either join key is missing, so a malformed export is caught
# here rather than surfacing as a less direct error from merge().
stopifnot(
  "file" %in% names(import1),
  "outputfile" %in% names(import2)
)

# Join import1 to import2 on import1$file == import2$outputfile.
# all.y = TRUE keeps every row of import2, including rows that have no match
# in import1 (their import1 columns are filled with NA).
import <- merge(import1, import2, by.x = "file", by.y = "outputfile", all.y = TRUE)

Add a syllable column. This will give us the syllable structure of the words in terms of c and v.

# Build a c/v skeleton of every word in a new `syllable` column.
#   1. Listed consonants become "c". Digraphs come first in the alternation so
#      that e.g. "kh" turns into one "c" rather than two.
#   2. a, e, i, o, u and y become "v".
#   3. Any whitespace between a "c" and a following "v" is squeezed out.
# NOTE(review): "t" appears twice in the consonant pattern (harmless), and any
# letter listed in neither pattern (e.g. "w") is left as it is in the skeleton.
import <- import %>%
  mutate(
    syllable = gsub("ph|th|kh|bh|dh|gh|ng|p|t|k|b|d|g|t|r|l|h|x|n|s|z|m", "c", word),
    syllable = gsub("a|e|i|o|u|y", "v", syllable),
    syllable = gsub("c\\s*v", "cv", syllable, ignore.case = TRUE, perl = TRUE)
  )

To view the output, use View(import). The following code will give us the position of the target vowel.

# Count the vowels of each word, i.e. the number of "v" characters in its
# skeleton. vapply() guarantees exactly one integer per row, so an empty or
# unusual input cannot silently change the column type as sapply() can.
import$vowelnumbers <- vapply(
  strsplit(import$syllable, ""),
  function(chars) sum(chars == "v"),
  integer(1)
)

# Rank the rows inside each (vowelnumbers, word_start) group by their start
# value. arrange() sorts the whole table; row_number() then counts within each
# group. The grouping is only needed for that ranking, so it is dropped straight
# away and does not leak into later steps.
# Note: from here on import1 holds this ranked table, not the raw import.
import1 <- import %>%
  group_by(vowelnumbers, word_start) %>%
  arrange(start) %>%
  mutate(v_rank = row_number()) %>%
  ungroup()

# Turn the rank into a position label. The modulo keeps the label cycling
# (first, second, ...) if a group holds more rows than the word has vowels.
# Words with four or more vowels, or a missing vowel count, get "later".
import2 <- import1 %>%
  mutate(vposition = case_when(
    vowelnumbers == 1 ~ "only",
    vowelnumbers == 2 & v_rank %% 2 == 1 ~ "first",
    vowelnumbers == 2 & v_rank %% 2 == 0 ~ "second",
    vowelnumbers == 3 & (v_rank %% 3 == 1) ~ "first",
    vowelnumbers == 3 & (v_rank %% 3 == 2) ~ "second",
    vowelnumbers == 3 & (v_rank %% 3 == 0) ~ "third",
    TRUE ~ "later"
  ))

To see the result, use View(import2). Check the overall frequency of words by syllable structure.

# Count how many rows carry each syllable skeleton
syllable_frequency <- table(import2$syllable)
# Sort the frequencies in descending order
syllable_frequency_sorted <- sort(syllable_frequency, decreasing = TRUE)
# Print the sorted frequency of values in the "syllable" column
print(syllable_frequency_sorted)

## 
##       cvcv        cvv      cvcvc      cvcvv        vcv     cvcvcv        cvc 
##        330        140        114        114        112        105         77 
##     cvcvvv         cv       vcvc      cvccv   cvcvcvcv      vcvcv    cvcvcvc 
##         76         51         46         42         28         24         21 
##    cvvcvcv    vcvcvcv      cvvcv    cvcvcvv       vccv       vcvv    cvccvcv 
##         20         20         18         16         16         15         12 
##   cvccvcvc  cvcvcvcvc       cvvc     cvvcvc       cvvv     vcvcvc      vcvvv 
##         12         12         12         12         12         12         12 
##       vvcv   cvvvcvcv          v     vcvvcv     cvccvc     vccvcv      vvccv 
##         12         10          9          8          6          6          6 
##  cvcvvcvcv  cvvcvcvcv cvvcvcvcvc   vcvcvvvc  vvcvcvcvc   cvcvccvv    cvcvvvc 
##          5          5          5          5          5          4          4 
##   cvvcvcvc     cvvcvv     cvvvcv      cvwvc   vccvccvv     vvcvcv    vvcvcvc 
##          4          4          4          4          4          4          4 
##      vvcvv      vvvcv     ccvvcv     cvccvv     cvcvvc     cvvccv      cvvvc 
##          4          4          3          3          3          3          3 
##     cvwvcv    vccvcvc    ccvccvc       ccvv         vc         vv       ccvc 
##          3          3          2          2          2          2          1 
##       cvcc        cvw 
##          1          1

Remove unnecessary tokens from the present data: keep only cvc, cvcvc, cvccv, cvcv, and cvccvc words, and only their first vowels.

# Keep every row of a "cvc" word, plus the first-vowel rows of
# cvcv, cvcvc, cvccv and cvccvc words.
import3 <- import2 %>%
  filter(
    syllable == "cvc" |
      (vposition == "first" &
         syllable %in% c("cvcv", "cvcvc", "cvccv", "cvccvc"))
  )

Check selected words’ frequency

# Cross-tabulate the retained rows: syllable skeleton (rows) by vowel label (columns)
frequency_table <- table(import3$syllable, import3$vowel)
# Print the frequency table
print(frequency_table)

##         
##           a  A  e  i  u
##   cvc    43 26  1  4  3
##   cvccv  11  5  1  1  3
##   cvccvc  2  0  0  0  1
##   cvcv   55 47 14 30 19
##   cvcvc  24 19  7  4  3

# Keep only the rows whose vowel label is one of a, A, e, o, u, i;
# rows with any other label (or a missing one) are dropped.
import4 <- import3 %>%
  filter(vowel %in% c("a", "A", "e", "o", "u", "i"))

Add the height of following vowels

# fvowel: for the first vowel of a two-vowel word, the last a/e/i/o/u letter
# (either case) found in the word; the greedy leading ".*" is what selects the
# last one. Every other row gets the placeholder "nil".
import5 <- mutate(
  import4,
  fvowel = ifelse(
    vposition == "first" & vowelnumbers == 2,
    sub(".*([aeiouAEIOU]).*", "\\1", word),
    "nil"
  )
)

# fvowelheight: "high" when the following vowel is i or u, "nil" when there is
# no following vowel, and "nonhigh" for everything else.
import6 <- mutate(
  import5,
  fvowelheight = ifelse(
    fvowel == "nil",
    "nil",
    ifelse(fvowel == "i" | fvowel == "u", "high", "nonhigh")
  )
)

Add syllabletype of the target vowel

# Label the syllable of the target vowel as open or closed.
#   - one-vowel words: closed if the skeleton ends in "c", open if it ends in "v"
#   - two-vowel words: closed if the skeleton contains "vccv", otherwise open
#     if it contains "vcv"
#   - anything else: "nil"
import7 <- import6 %>%
  mutate(
    syllabletype = case_when(
      vowelnumbers == 1 & endsWith(syllable, "c") ~ "closed",
      vowelnumbers == 1 & endsWith(syllable, "v") ~ "open",
      vowelnumbers == 2 & grepl("[v]cc[v]", syllable) ~ "closed",
      vowelnumbers == 2 & grepl("[v]c[v]", syllable) ~ "open",
      TRUE ~ "nil"
    )
  )

# For the second vowel of a two-vowel word, the label is decided by the final
# segment of the skeleton instead; all other rows keep the label from above.
import8 <- import7 %>%
  mutate(
    syllabletype = case_when(
      vposition == "second" & vowelnumbers == 2 & endsWith(syllable, "c") ~ "closed",
      vposition == "second" & vowelnumbers == 2 & endsWith(syllable, "v") ~ "open",
      TRUE ~ syllabletype
    )
  )

# Drop helper and bookkeeping columns that are not used from here on.
import9 <- import8 %>%
  select(
    -c(
      duration.x, v_rank, fvowel, label, group,
      color, number, interval, omit, word_interval
    )
  )

Add following consonant of the target vowel

# Derive fconsonant, the consonant material that follows the target vowel.
# Step 1 takes a raw two-character slice of the word; the steps below clean it up.
vowelalldata11 <- import9 %>%
  mutate(fconsonant = case_when(
    # Closed syllable, vowel is the second or only one: last two characters of the word
    syllabletype == "closed" & (vposition == "second" | vposition == "only") ~ substr(word, nchar(word) - 1, nchar(word)),
    # Closed syllable, vowel is the first one: the two characters right after the first vowel
    syllabletype == "closed" & vposition == "first" ~ {
      first_vowel_index <- regexpr("[aeiouAEIOU]", word)
      substr(word, first_vowel_index + 1, first_vowel_index + 2)
    },
    syllabletype == "open" ~ "q",
    TRUE ~ "q" # Every remaining case also gets the placeholder "q"
  ))

# Remove vowels from the fconsonant column
vowelalldata12 <- vowelalldata11 %>%
  mutate(fconsonant = gsub("[aeiouAEIOU]", "", fconsonant))

# Reduce what is left to a single consonant: a listed digraph
# (ph, th, kh, bh, dh, gh, ng) is kept whole; any other pair of letters is cut
# down to its first letter.
vowelalldata12$fconsonant <- gsub("(ph|th|kh|bh|dh|gh|ng)|([a-zA-Z])([a-zA-Z])", "\\1\\2", vowelalldata12$fconsonant)

# For an open first syllable, replace the "q" placeholder with a run of letters
# from the word that is flanked by vowels (the greedy leading ".*" picks the
# rightmost such run). If the pattern does not match, gsub() returns the word unchanged.
vowelalldata12$fconsonant <- ifelse(vowelalldata12$syllabletype == "open" & vowelalldata12$vposition == "first",
                                    gsub(".*[aeiouAEIOU]([a-zA-Z]+)[aeiouAEIOU].*", "\\1", vowelalldata12$word),
                                    vowelalldata12$fconsonant)

# Classify the following consonant by manner of articulation and store the
# class in the column `maner` (column name kept exactly as spelled).
# The placeholder "q" is passed through; any value not listed becomes NA.
vowelalldata12 <- vowelalldata12 %>%
  mutate(
    maner = case_when(
      fconsonant == "q" ~ "q",
      fconsonant %in% c("ph", "th", "kh", "bh", "dh", "gh") ~ "aspirated",
      fconsonant %in% c("p", "t", "k", "b", "d", "g") ~ "stops",
      fconsonant %in% c("s", "z", "x", "h") ~ "fricative",
      fconsonant %in% c("m", "n", "ng") ~ "nasal",
      fconsonant %in% c("r", "l") ~ "nonobstruents",
      TRUE ~ NA_character_
    )
  )

Identify E harmony vowels, basically CVCV

# svowel starts out as a copy of the raw vowel labels.
vowelalldata12$svowel <- vowelalldata12$vowel

# Step 1: temporarily mark with "E" every "e" that is the first vowel of an
# open syllable and is followed by a high vowel.
vowelalldata12$svowel[with(
  vowelalldata12,
  svowel == "e" & vposition == "first" &
    syllabletype == "open" & fvowelheight == "high"
)] <- "E"

# Step 2: swap the two labels in one pass. After the swap, the rows marked in
# step 1 read "e" and every other former "e" reads "E".
vowelalldata12$svowel <- chartr(old = "eE", new = "Ee", x = vowelalldata12$svowel)

Similarly, identify A harmony vowels

# Recode "A" as "M" when it is the first vowel of an open syllable and the
# following vowel is high.
vowelalldata12$svowel[vowelalldata12$svowel == "A" & vowelalldata12$vposition == "first" & vowelalldata12$syllabletype == "open" & vowelalldata12$fvowelheight == "high"] <- "M"

# Remove the rows whose file value is "--".
# `file != "--"` evaluates to NA when file is missing, and an NA in a logical
# row index yields a row made entirely of NA. %in% never returns NA, so a row
# with a missing file value is kept unchanged instead.
vowelalldata12 <- vowelalldata12[!(vowelalldata12$file %in% "--"), ]
#View(vowelalldata12)

Check selected vowels’ distribution

# Three-way count: syllable skeleton (rows) by recoded vowel svowel (columns),
# printed as one slice per fvowelheight value. This reuses the name frequency_table.
frequency_table <- table(vowelalldata12$syllable, vowelalldata12$svowel, vowelalldata12$fvowelheight)
# Print the frequency table
print(frequency_table)

## , ,  = high
## 
##         
##           a  A  e  E  i  M  u
##   cvc     0  0  0  0  0  0  0
##   cvccv   3  2  0  1  0  0  0
##   cvccvc  0  0  0  0  0  0  0
##   cvcv   28  0 10  0 17 24  9
##   cvcvc   9  0  3  0  0 13  0
## 
## , ,  = nil
## 
##         
##           a  A  e  E  i  M  u
##   cvc    41 22  0  1  4  0  3
##   cvccv   0  0  0  0  0  0  0
##   cvccvc  0  0  0  0  0  0  0
##   cvcv    0  0  0  0  0  0  0
##   cvcvc   0  0  0  0  0  0  0
## 
## , ,  = nonhigh
## 
##         
##           a  A  e  E  i  M  u
##   cvc     0  0  0  0  0  0  0
##   cvccv   8  3  0  0  1  0  3
##   cvccvc  2  0  0  0  0  0  1
##   cvcv   26 14  0  4  7  0  6
##   cvcvc  14  4  0  4  3  0  3

#View(vowelalldata12)

Save file

#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/ranju/ranjufinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/sunita/sunitafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/Anima/animafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/Girija/girijafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/parme/parmefinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/gayari/gayarifinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/Transcription/pratibha/pratibhafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/bongaigaon/ashokfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/bongaigaon/dilipfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/bongaigaon/nimalfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/bongaigaon/raheswarfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/kamrup/birendrabaihatafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/kamrup/gitikachaigaonfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/kamrup/jogeshfinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/kamrup/binafinal.csv"
#output_file <- "/Users/saurabhkumarnath/Desktop/kamrup/kusumfinal.csv"
#write.csv(vowelalldata12, file = output_file, row.names = FALSE)

Step1: Data Cleaning for Individuals

Saurabh Nath