--- title: 'Mapping boundaries: Microprosodic and dialectal variability in the perception of Estonian long and overlong quantity' author: "Pärtel Lippus" date: "`r Sys.Date()`" output: html_document --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` This is the R code for conducting the analysis published in Lippus, Pärtel, Liis Kask, Sofia Lutter, Nele Põldver, Kairi Kreegipuu. 2025. Mapping boundaries: Microprosodic and dialectal variability in the perception of Estonian long and overlong quantity. *Eesti Rakenduslingvistika Ühingu aastaraamat = Estonian Papers in Applied Linguistics* 21. Load packages and Estonian base map for plotting the geoinfo. ```{r, message=F, warning=F} library(tidyverse) library(flextable) # aluskaart library(rworldmap) library(ggthemes) data("countryExData", envir = environment(), package = "rworldmap") mymap = joinCountryData2Map(countryExData, joinCode = "ISO3", nameJoinColumn = "ISO3V10", mapResolution = "low") %>% fortify(mymap) %>% subset(id=="Estonia") ``` ## The data set and summary of the factors The data comes from an online perception study carried out in the Kaemus online experiment enviroment. The stimuli for the perception study were recorded from a female Estonian speaker and re-synthesised in Praat. The original recordings, the Praat scripts for running the manipulation of durations and f0 contours and stimulus files are included in the current repository. The perception study data has been converted from Kaemus output to Rda file, it has been manually sorted and modified (adding latitude and longitude data for the locations, for example). Read the full dataset. ```{r} load("mapping_boundaries_dataset_full.Rda") # remove rows with no response dat<-uuring[!is.na(uuring$response),] dat$q3resp <- as.numeric(dat$response=="Q3") ``` The table `dat` has the following columns: - _KI_ -- participant ID. The number after underscore is the year of the data collection (2020 or 2021) - _age_ -- participant's age in years - _gender_ -- levels F (female)/M (male) - _education_ -- levels: kesk (secondary) / kesk-eri (vocational education) / BA / MA / PhD - _rhythm_ -- sense of rythm (self reported) - _mothertongue_ -- levels eesti (Estonian) / vene (Russian) - _language_skills_ -- low / medium / high summarised from self reported second language skills - _dialects_ -- knowledge of Estonian dialects (and the level) - _speechrate_ -- response to "how often others say you are a fast speaker", six levels from "never" to "very often" - _speechrate_self_ -- response to "do you consider yourself as a fast speaker", four levels from "no" to "yes" - _hear_out_of_tune_ -- response to "do you hear if singing is out of tune", three levels from "no" to "yes" - _place_ -- participant's place of habitation summarised from the reported list of locations - _place_lat_ -- latitude of the location - _place_lon_ -- longitude of the location - _mother_place_ -- participant's mother's place of habitation - _mother_lat_ - _mother_lon_ - _place_of_living_ -- particiapnt's current place of habitation - _place_of_living_lat_ - _place_of_living_lon_ - _father_place_ -- participant's father's place of habitation - _father_lat_ - _father_lon_ - _ID_ -- stimulus ID in the Kaemus experiment - _stimulus_ -- stimulus file name - _base_word_ -- the phonemic sequences sequence of the base word - _base_quant_ -- the original quantity of the base word, levels: Q1, Q2, Q3 - _manip_ -- what was manipulated: vok1 = duration of V1, vok2 = duration of V1 and V2, f0 = the pitch contour - _v1dur_ -- V1 duration of the stimulus in milliseconds - _v2dur_ -- V2 duration of the stimulus in milliseconds - _response_ -- participant's responce to the stimulus, levels: Q1, Q2, Q3 - _q3resp_ -- binary value if the responce is Q3 (1) or not (0) ## Prepare the data for the current study We leave out all participants who did not recognise the control stimuli with Q1 properties correctly as Q1. ```{r} valesti_vastanud_KId <- unique(dat$KI[which(dat$base_quant=="Q1" & dat$v1dur==110 & dat$response!="Q1")]) dat <- dat[!dat$KI %in% valesti_vastanud_KId,] ``` Leave out the trials where the responce to other stimuli was Q1. ```{r} dat <- dat[dat$v1dur!=110 & dat$response!="Q1",] # refactor the responce to leave out Q1 which is now an empty category dat$response <- factor(dat$response) ``` Add factor `pitch_cue` to describe the pitch contour of the stimuli. If the base word was Q2 and the stimuli are created by manipulation of durations (Sets 1 & 2) or the base word was Q3 and the stimuli are created by manipulation of f0 (Set 3), the pitch is level (in the stressed syllable). Otherwise (Q3 baseword Set 1 & 2 and Q2 baseword Set 3) the pitch is falling. ```{r} # teeb põhitooni kirjeldava faktori pitch: level/fall dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q2") | (dat$manip != "f0" & dat$base_quant == "Q3")] <- "fall" dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q3") | (dat$manip != "f0" & dat$base_quant != "Q3")] <- "level" dat$pitch_cue <- factor(dat$pitch_cue, levels = c("level","fall")) ``` Calculate the __Pitch Sensitivity Scores__ (`dur2pitch_score`). ```{r} # kestuse-vs-põhitooni skoor for(i in unique(dat$KI)){ tmp <- dat[dat$KI==i & dat$base_word!="tada",] dat$dur2pitch_score[dat$KI==i] <- length(which( (tmp$response=="Q3" & tmp$pitch_cue=="level") | (tmp$response=="Q2" & tmp$pitch_cue=="fall") )) / nrow(tmp) } ``` Split the dataset by speakers' mother tongue. In this study we continue by analysing the __Estonian L1 subset__ in `dat.ee`. ```{r} dat.ee <- dat[dat$mothertongue=="eesti",] dat.vn <- dat[dat$mothertongue!="eesti",] ``` The table `dat.ee` has the data by stimuli. Now we create a table `ki.ee` with __data by participants__. ```{r} dat.ee %>% distinct(KI, age, gender, mother_place, mother_lat, mother_lon, place, place_lat, place_lon, father_lat, father_lon, education, dur2pitch_score, rhythm, hear_out_of_tune, language_skills, dialects,speechrate, speechrate_self) -> ki.ee ``` Save the subset tables as one Rda file and also separately as CSV files. ```{r, eval=FALSE} save(file="mapping_boundaries_dataset_filtered.Rda", dat.ee, ki.ee) write_csv(dat.ee, file = "mapping_boundaries_data_by_stimuli.csv") write_csv(ki.ee, file = "mapping_boundaries_data_by_participants.csv") ``` ## Describing the participants and their regional background Number of participants by gender: ```{r} ki.ee %>% group_by(gender) %>% summarise(number = n()) %>% flextable() ``` Number of participants by education: ```{r} ki.ee %>% group_by(education) %>% summarise(number = n()) %>% flextable() ``` Plot the place of habitation of the participants (Figure 3). ```{r elukoht} set.seed(99) joonis_elukoht <- ggplot(ki.ee) + geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") + geom_jitter( aes(x = place_lon, y=place_lat), alpha = 0.4, width=0.1, height =0.1, na.rm = T) + scale_color_brewer(palette = "Dark2") + theme_map() + ylim(57.5,59.7) + xlim(21.8,28.2) + NULL joonis_elukoht ``` Save the figures for the paper. ```{r, eval=FALSE} ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.pdf", device = "pdf", width = 8, height = 5, units = "in") ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.png", device = "png", width = 8, height = 5, units = "in") ``` Plot the place of habitation of the participants and their parents (Figure 7). ```{r paritolu} set.seed(99) joonis_paritolu <- ki.ee %>% pivot_longer(., cols = c("mother_lon","mother_lat","place_lon","place_lat","father_lat","father_lon"), names_to = "tunnus", values_to = "väärtus") %>% mutate(pos = gsub("(.*)_(.*)", "\\2",tunnus), tunnus = gsub("(.*)_(.*)", "\\1",tunnus)) %>% pivot_wider(., names_from = "pos", values_from = "väärtus") %>% mutate(tunnus = factor(tunnus, levels = c("mother", "place", "father"))) %>% mutate(tunnus = fct_recode(tunnus, participant = "place")) %>% mutate(x = interaction(lat, lon)) %>% group_by(x) %>% mutate(freq = n()) %>% ungroup() %>% ggplot() + geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") + geom_line(aes(x = lon, y = lat, group = KI), col = "darkgray", alpha = 0.4, position = "jitter") + geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+ geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+ geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.8, na.rm = T)+ scale_color_brewer(palette = "Dark2") + scale_shape_manual(values=c(2, 16, 6))+ theme_map() + theme(legend.title=element_blank())+ ylim(57.5,59.7) + xlim(21.8,28.2) + NULL joonis_paritolu ``` ```{r, eval=F} ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.png", device = "png", width = 8, height = 5, units = "in") ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.pdf", device = "pdf", width = 8, height = 5, units = "in") ``` Plot the regional background of the participants by Pitch Sensitivity Score (Figure 5) ```{r dur2pich-fig} set.seed(99) joonis_dur2pich_score<-ki.ee %>% mutate(cue = c( "duration", "pitch")[2- as.numeric(ki.ee$dur2pitch_score < 0.1) ], lat = mother_lat, lon = mother_lon) %>% mutate(x = interaction(lat, lon)) %>% group_by(x) %>% mutate(freq = n()) %>% ungroup() %>% ggplot(aes(y = place_lat, x = place_lon, col = cue))+ geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") + geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+ geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+ geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.8, na.rm = T)+ scale_color_brewer(palette = "Dark2") + scale_shape_manual(values=c(2, 16, 6))+ theme_map() + theme(legend.title=element_blank())+ ylim(57.5,59.7) + xlim(21.8,28.2) + NULL joonis_dur2pich_score ``` ```{r, eval=F} ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.png", device = "png", width = 8, height = 5, units = "in") ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.pdf", device = "pdf", width = 8, height = 5, units = "in") ``` ## Analysing the effect of microprosodic variation The figure of proportion of Q3 responses by V1 duration pitch cue and phoneme sequences (Figure 4). ```{r, warning=FALSE, message=F} levels(dat.ee$base_word) <- gsub("^(.)(.)(.*)$", "\\1\\2\\2\\3",levels(dat.ee$base_word)) (joonis.main.result <- dat.ee %>% filter(manip != "f0", dur2pitch_score>0.1) %>% group_by(base_word, pitch_cue, v1dur) %>% summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate, conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1], conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>% group_by(base_word, pitch_cue) %>% mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>% mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>% ggplot(aes(y = Q3prop, x=v1dur, fill= base_word))+ geom_hline(yintercept=.5, linetype="dashed", color = "black")+ geom_line(aes(col=base_word))+ geom_text(aes(label=base_word), size = 2.5)+ scale_x_continuous(breaks = c(170,200, 230, 260, 290))+ geom_ribbon(aes(ymin= conf.low, ymax = conf.high), alpha=0.35)+ geom_rect( aes(xmin = ylemine, xmax = alumine, ymin = 0, ymax =1), alpha = 0.2, fill = "gray")+ scale_color_brewer(palette = "Dark2") + scale_fill_brewer(palette = "Dark2") + facet_grid(rows = vars(base_word), cols= vars(pitch_cue))+ labs(y = "Proportion of Q3 responses", x = "V1 duration")+ guides(fill = F, color = F)+ theme_bw()+ NULL) ``` ```{r, eval=F} ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.png", device = "png", width = 6, height = 6, units = "in") ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.pdf", device = "pdf", width = 6, height = 6, units = "in") ``` Finding the category boundary by base word ```{r, warning=FALSE, message=FALSE} dat.ee %>% filter(manip != "f0", dur2pitch_score>0.1) %>% group_by(base_word, pitch_cue, v1dur) %>% summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate, conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1], conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>% group_by(base_word, pitch_cue) %>% mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>% mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>% select(base_word, pitch_cue, alumine, ylemine) %>% distinct() -> tmp tmp[tmp$pitch_cue=="level", c("base_word", "ylemine")] %>% flextable() %>% set_caption("The lowest V1 duration from which stimuli with level pitch are perceived as Q3") tmp[tmp$pitch_cue=="fall", c("base_word", "alumine")] %>% flextable() %>% set_caption("The highest V1 duration until which stimuli with falling pitch are perceived as Q2") ``` ## Random Forest model explaining the variation of the Pitch Sensitivity Score ```{r, message=FALSE, warning=FALSE} library(randomForest) library(caTools) ``` Split the data into training and testing subsets. ```{r, eval=T} set.seed(66) split <- sample.split(ki.ee, SplitRatio = 0.8) ki_train <- subset(ki.ee, split==T) ki_test <- subset(ki.ee, split==F) ``` Run the model. ```{r} set.seed(43) mets <- randomForest(dur2pitch_score ~ age + gender + mother_lat + mother_lon + education + rhythm + hear_out_of_tune + language_skills + dialects + speechrate + speechrate_self, data = ki.ee, importance = T, na.action = na.omit) mets importance(mets, scale=F) #varImpPlot(mets, scale=F, type = 1) pred_test <- predict(mets, newdata = ki_test, type = "class") x <- (table(pred_test,ki_test$dur2pitch_score)) round((x[1,1]+x[2,2])/sum(x)*100) ``` Plot the results (Figure 6). ```{r} df <- as.data.frame(importance(mets, scale = F)) (joonis_mets <- ggplot(data = df, aes(y = reorder(rownames(df), `%IncMSE`), x = `%IncMSE`, alpha=`%IncMSE`)) + geom_bar(stat = "identity") + labs(y="")+ guides(fill = F, color = F, alpha =F)+ theme_bw()+ NULL) ``` ```{r} ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.png", device = "png", width = 5, height = 3, units = "in") ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.pdf", device = "pdf", width = 5, height = 3, units = "in") ``` ## Plotting the base words Read the Pitch tracks of the base words measured in Praat and plot the pitch curves (Figure 2). ```{r, message=F, warning=F} pitchtracks <- read.delim("baseword_pitch_tracks.txt", na.strings = "--undefined--") pitchtracks$segm <- factor(pitchtracks$segm); levels(pitchtracks$segm) = c("C1","V1", "C2", "V2") pitchtracks$Word <- gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",pitchtracks$Word) pitchtracks$Word <- factor(pitchtracks$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada")) (joonis.f0 <- pitchtracks %>% filter (Quantity != "Q1", segm %in% c("V1","V2")) %>% ggplot( aes(y=Pitch, x=time*1000, group=interaction(Word, segm), col=Word))+ geom_line(alpha = 0.7, lwd = 2)+ facet_grid(cols = vars(Quantity))+ scale_color_brewer(palette = "Dark2") + xlim(0,500) + labs(y = "Pitch (Hz)", x = "Time (ms)")+ theme_bw()+ NULL) ``` ```{r, eval=F} ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.pdf", device = "pdf", width = 8, height = 3, units = "in") ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.png", device = "png", width = 8, height = 3, units = "in") ``` Get the F0 value in the end of V1 ```{r} pitchtracks %>% filter(segm == "V1" & !is.na(Pitch)) %>% group_by(Word, Quantity) %>% mutate(V1_end_f0 = Pitch[which.max(time)]) %>% ungroup() %>% select(Word, Quantity, V1_end_f0) %>% distinct() %>% group_by(Quantity) %>% summarise(mean = mean(V1_end_f0)) %>% flextable() ``` Get the max f0. ```{r} pitchtracks %>% filter(segm == "V1" & !is.na(Pitch)) %>% group_by(Word, Quantity) %>% mutate(max = max(Pitch)) %>% ungroup() %>% select(Word, Quantity, max) %>% distinct() %>% group_by(Quantity) %>% summarise(mean = mean(max)) %>% flextable() ``` Plotting the duration (Figure 1) ```{r, warning=FALSE, message=F} library(rPraat) dir = "basewords/" durs <- data.frame() for(q in c("Q2","Q3")){ for(bsw in c("sada","sage", "sagi", "ladi", "liga", "vide", "vigi", "tada")){ fname = paste0(q, "_", bsw) tg = tg.read(paste0(dir, fname, ".TextGrid")) tmp <- data.frame( Quantity = q, Word = gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",bsw), V1 = (tg$häälikud$t2[2] - tg$häälikud$t1[2])*1000, V2 = (tg$häälikud$t2[4] - tg$häälikud$t1[4])*1000) durs <- rbind(durs, tmp) } } durs$Word <- factor(durs$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada")) (joonis.kest <- durs %>% mutate(`V1/V2 ratio` = V1/V2) %>% pivot_longer(cols = 3:5, names_to = "var", values_to = "Duration") %>% mutate_at(vars(var), funs(factor(., levels = c("V1","V2","V1/V2 ratio") ))) %>% ggplot(aes(y=Duration, x=Quantity))+ geom_boxplot()+ #facet_grid(cols = vars(var))+ facet_wrap("var", scales = "free")+ theme_bw()) ``` ```{r, eval=F} ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.pdf", device = "pdf", width = 8, height = 3, units = "in") ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.png", device = "png", width = 8, height = 3, units = "in") ``` This is the end of the code sheet.