This is the R code for conducting the analysis published in Lippus, Pärtel, Liis Kask, Sofia Lutter, Nele Põldver, Kairi Kreegipuu. 2025. Mapping boundaries: Microprosodic and dialectal variability in the perception of Estonian long and overlong quantity. Eesti Rakenduslingvistika Ühingu aastaraamat = Estonian Papers in Applied Linguistics 21.

Load packages and Estonian base map for plotting the geoinfo.

library(tidyverse)
library(flextable)

# aluskaart
library(rworldmap)
library(ggthemes)
data("countryExData", envir = environment(), package = "rworldmap")
mymap = joinCountryData2Map(countryExData, joinCode = "ISO3", nameJoinColumn = "ISO3V10", mapResolution = "low") %>% fortify(mymap) %>% subset(id=="Estonia")
## 149 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data

The data set and summary of the factors

The data comes from an online perception study carried out in the Kaemus online experiment enviroment. The stimuli for the perception study were recorded from a female Estonian speaker and re-synthesised in Praat. The original recordings, the Praat scripts for running the manipulation of durations and f0 contours and stimulus files are included in the current repository. The perception study data has been converted from Kaemus output to Rda file, it has been manually sorted and modified (adding latitude and longitude data for the locations, for example).

Read the full dataset.

load("mapping_boundaries_dataset_full.Rda")
# remove rows with no response
dat<-uuring[!is.na(uuring$response),]
dat$q3resp <- as.numeric(dat$response=="Q3")

The table dat has the following columns:

Prepare the data for the current study

We leave out all participants who did not recognise the control stimuli with Q1 properties correctly as Q1.

valesti_vastanud_KId <- unique(dat$KI[which(dat$base_quant=="Q1" & dat$v1dur==110 & dat$response!="Q1")])
dat <- dat[!dat$KI %in% valesti_vastanud_KId,]

Leave out the trials where the responce to other stimuli was Q1.

dat <- dat[dat$v1dur!=110  & dat$response!="Q1",]

# refactor the responce to leave out Q1 which is now an empty category
dat$response <- factor(dat$response)

Add factor pitch_cue to describe the pitch contour of the stimuli. If the base word was Q2 and the stimuli are created by manipulation of durations (Sets 1 & 2) or the base word was Q3 and the stimuli are created by manipulation of f0 (Set 3), the pitch is level (in the stressed syllable). Otherwise (Q3 baseword Set 1 & 2 and Q2 baseword Set 3) the pitch is falling.

# teeb põhitooni kirjeldava faktori pitch: level/fall
dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q2") | (dat$manip != "f0" & dat$base_quant == "Q3")] <- "fall"
dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q3") | (dat$manip != "f0" & dat$base_quant != "Q3")] <- "level"
dat$pitch_cue <- factor(dat$pitch_cue, levels = c("level","fall"))

Calculate the Pitch Sensitivity Scores (dur2pitch_score).

# kestuse-vs-põhitooni skoor 
for(i in unique(dat$KI)){
  tmp <- dat[dat$KI==i & dat$base_word!="tada",]
  dat$dur2pitch_score[dat$KI==i] <- length(which( (tmp$response=="Q3" & tmp$pitch_cue=="level") | (tmp$response=="Q2" & tmp$pitch_cue=="fall") )) / nrow(tmp)
}

Split the dataset by speakers’ mother tongue. In this study we continue by analysing the Estonian L1 subset in dat.ee.

dat.ee <- dat[dat$mothertongue=="eesti",]
dat.vn <- dat[dat$mothertongue!="eesti",]

The table dat.ee has the data by stimuli. Now we create a table ki.ee with data by participants.

dat.ee %>% 
  distinct(KI, age, gender, mother_place, mother_lat, mother_lon, place, place_lat, place_lon, father_lat, father_lon, education, dur2pitch_score, rhythm, hear_out_of_tune, language_skills, dialects,speechrate, speechrate_self) -> ki.ee

Save the subset tables as one Rda file and also separately as CSV files.

save(file="mapping_boundaries_dataset_filtered.Rda", dat.ee, ki.ee)
write_csv(dat.ee, file = "mapping_boundaries_data_by_stimuli.csv")
write_csv(ki.ee, file = "mapping_boundaries_data_by_participants.csv")

Describing the participants and their regional background

Number of participants by gender:

ki.ee %>% 
  group_by(gender) %>% 
  summarise(number = n()) %>% 
  flextable()

gender

number

F

250

M

40

Number of participants by education:

ki.ee %>% 
  group_by(education) %>% 
  summarise(number = n()) %>% 
  flextable()

education

number

kesk

86

kesk-eri

16

BA

66

MA

98

PhD

21

3

Plot the place of habitation of the participants (Figure 3).

set.seed(99)
joonis_elukoht <- ggplot(ki.ee) +
  geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
  geom_jitter( aes(x = place_lon, y=place_lat), alpha = 0.4, width=0.1, height =0.1, na.rm = T) + 
  scale_color_brewer(palette = "Dark2") +
  theme_map() +
  ylim(57.5,59.7) + xlim(21.8,28.2) +
  NULL
joonis_elukoht

Save the figures for the paper.

ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.pdf", device = "pdf", width = 8, height = 5, units = "in")
ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.png", device = "png", width = 8, height = 5, units = "in")

Plot the place of habitation of the participants and their parents (Figure 7).

set.seed(99)
joonis_paritolu <- ki.ee %>% 
  pivot_longer(., cols = c("mother_lon","mother_lat","place_lon","place_lat","father_lat","father_lon"), names_to = "tunnus", values_to = "väärtus") %>% 
  mutate(pos = gsub("(.*)_(.*)", "\\2",tunnus), tunnus = gsub("(.*)_(.*)", "\\1",tunnus)) %>% 
  pivot_wider(., names_from = "pos", values_from = "väärtus") %>% 
  mutate(tunnus = factor(tunnus, levels = c("mother", "place", "father")))  %>% 
  mutate(tunnus = fct_recode(tunnus, participant = "place")) %>% 
  mutate(x = interaction(lat, lon)) %>% 
  group_by(x) %>% mutate(freq = n()) %>% ungroup()  %>% 
ggplot() +
  geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
  geom_line(aes(x = lon, y = lat,  group = KI), col = "darkgray", alpha = 0.4, position = "jitter") +
  geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+
  geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+
  geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.8, na.rm = T)+
  scale_color_brewer(palette = "Dark2") +
  scale_shape_manual(values=c(2, 16, 6))+
  theme_map() +
  theme(legend.title=element_blank())+
  ylim(57.5,59.7) + xlim(21.8,28.2) +
  NULL
joonis_paritolu
## Warning: Removed 26 rows containing missing values (`geom_line()`).

ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.png", device = "png", width = 8, height = 5, units = "in")
ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.pdf", device = "pdf", width = 8, height = 5, units = "in")

Plot the regional background of the participants by Pitch Sensitivity Score (Figure 5)

set.seed(99)
joonis_dur2pich_score<-ki.ee %>% 
  mutate(cue = c( "duration", "pitch")[2- as.numeric(ki.ee$dur2pitch_score < 0.1) ], lat = mother_lat, lon = mother_lon) %>% 
  mutate(x = interaction(lat, lon)) %>% 
  group_by(x) %>% mutate(freq = n()) %>% ungroup() %>% 
  ggplot(aes(y = place_lat, x = place_lon, col = cue))+
  geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
  geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+
  geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+
  geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.8, na.rm = T)+
  scale_color_brewer(palette = "Dark2") +
  scale_shape_manual(values=c(2, 16, 6))+
  theme_map() +
  theme(legend.title=element_blank())+
  ylim(57.5,59.7) + xlim(21.8,28.2) +
  NULL
joonis_dur2pich_score

ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.png", device = "png", width = 8, height = 5, units = "in")
ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.pdf", device = "pdf", width = 8, height = 5, units = "in")

Analysing the effect of microprosodic variation

The figure of proportion of Q3 responses by V1 duration pitch cue and phoneme sequences (Figure 4).

levels(dat.ee$base_word) <- gsub("^(.)(.)(.*)$", "\\1\\2\\2\\3",levels(dat.ee$base_word))
(joonis.main.result <- dat.ee %>% 
  filter(manip != "f0", dur2pitch_score>0.1) %>% 
  group_by(base_word, pitch_cue, v1dur) %>% 
  summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate,
            conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1],
            conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>%
  group_by(base_word, pitch_cue) %>% 
  mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>% 
  mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>% 
ggplot(aes(y = Q3prop, x=v1dur, fill= base_word))+
  geom_hline(yintercept=.5, linetype="dashed", color = "black")+
  geom_line(aes(col=base_word))+
  geom_text(aes(label=base_word), size = 2.5)+
  scale_x_continuous(breaks = c(170,200, 230, 260, 290))+
  geom_ribbon(aes(ymin= conf.low, ymax = conf.high), alpha=0.35)+
  geom_rect( aes(xmin = ylemine, xmax = alumine, ymin = 0, ymax =1), alpha = 0.2, fill = "gray")+
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  facet_grid(rows = vars(base_word), cols= vars(pitch_cue))+
  labs(y = "Proportion of Q3 responses", x = "V1 duration")+
  guides(fill = F, color = F)+
  theme_bw()+
NULL)

ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.png", device = "png", width = 6, height = 6, units = "in")
ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.pdf", device = "pdf", width = 6, height = 6, units = "in")

Finding the category boundary by base word

dat.ee %>% 
  filter(manip != "f0", dur2pitch_score>0.1) %>% 
  group_by(base_word, pitch_cue, v1dur) %>% 
  summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate,
            conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1],
            conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>%
  group_by(base_word, pitch_cue) %>% 
  mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>% 
  mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>% 
  select(base_word, pitch_cue, alumine, ylemine) %>% 
  distinct() -> tmp

tmp[tmp$pitch_cue=="level", c("base_word", "ylemine")] %>% 
  flextable() %>% 
  set_caption("The lowest V1 duration from which stimuli with level pitch are perceived as Q3")
The lowest V1 duration from which stimuli with level pitch are perceived as Q3

base_word

ylemine

saada

286.6925

saage

267.9562

saagi

laadi

286.9299

liiga

244.8272

viide

272.1807

viigi

274.6668

taada

283.4490

tmp[tmp$pitch_cue=="fall", c("base_word", "alumine")] %>% 
  flextable() %>% 
  set_caption("The highest V1 duration until which stimuli with falling pitch are perceived as Q2")
The highest V1 duration until which stimuli with falling pitch are perceived as Q2

base_word

alumine

saada

209.4782

saage

222.6034

saagi

233.9322

laadi

215.5374

liiga

236.0390

viide

207.4542

viigi

213.6724

taada

290.0000

Random Forest model explaining the variation of the Pitch Sensitivity Score

library(randomForest)
library(caTools)

Split the data into training and testing subsets.

set.seed(66)
split <- sample.split(ki.ee, SplitRatio = 0.8)
ki_train <- subset(ki.ee, split==T) 
ki_test <- subset(ki.ee, split==F)

Run the model.

set.seed(43)
mets <- randomForest(dur2pitch_score ~ age + gender + mother_lat + mother_lon + education + rhythm + hear_out_of_tune + language_skills + dialects + speechrate + speechrate_self, data = ki.ee, importance = T, na.action = na.omit)

mets
## 
## Call:
##  randomForest(formula = dur2pitch_score ~ age + gender + mother_lat +      mother_lon + education + rhythm + hear_out_of_tune + language_skills +      dialects + speechrate + speechrate_self, data = ki.ee, importance = T,      na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.02606669
##                     % Var explained: 13.02
importance(mets, scale=F)
##                        %IncMSE IncNodePurity
## age               2.444087e-03     1.2181820
## gender           -9.592358e-05     0.1335064
## mother_lat        3.920165e-04     0.9430299
## mother_lon        1.786110e-03     1.1320883
## education         2.219690e-03     0.5784624
## rhythm            2.683659e-03     0.9143883
## hear_out_of_tune  1.157358e-03     0.3846429
## language_skills   4.147133e-04     0.3524418
## dialects         -4.822263e-05     0.4015656
## speechrate        4.504805e-04     0.5645336
## speechrate_self   7.418151e-04     0.4927696
#varImpPlot(mets, scale=F, type = 1)

pred_test <- predict(mets, newdata = ki_test, type = "class")
x <- (table(pred_test,ki_test$dur2pitch_score))
round((x[1,1]+x[2,2])/sum(x)*100)
## [1] 3

Plot the results (Figure 6).

df <- as.data.frame(importance(mets, scale = F))
(joonis_mets <- ggplot(data = df, aes(y = reorder(rownames(df), `%IncMSE`), x = `%IncMSE`, alpha=`%IncMSE`)) +
  geom_bar(stat = "identity") +
  labs(y="")+
  guides(fill = F, color = F, alpha =F)+
  theme_bw()+
  NULL)

ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.png", device = "png", width = 5, height = 3, units = "in")
ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.pdf", device = "pdf", width = 5, height = 3, units = "in")

Plotting the base words

Read the Pitch tracks of the base words measured in Praat and plot the pitch curves (Figure 2).

pitchtracks <- read.delim("baseword_pitch_tracks.txt", na.strings = "--undefined--")
pitchtracks$segm <- factor(pitchtracks$segm); levels(pitchtracks$segm) = c("C1","V1", "C2", "V2")
pitchtracks$Word <- gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",pitchtracks$Word)
pitchtracks$Word <- factor(pitchtracks$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada"))

(joonis.f0 <- pitchtracks %>% 
  filter (Quantity != "Q1", segm %in% c("V1","V2")) %>% 
  ggplot( aes(y=Pitch, x=time*1000, group=interaction(Word, segm), col=Word))+
  geom_line(alpha = 0.7, lwd = 2)+
  facet_grid(cols = vars(Quantity))+
  scale_color_brewer(palette = "Dark2") +
  xlim(0,500) +
  labs(y = "Pitch (Hz)", x = "Time (ms)")+
  theme_bw()+
  NULL)

ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.pdf", device = "pdf", width = 8, height = 3, units = "in")
ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.png", device = "png", width = 8, height = 3, units = "in")

Get the F0 value in the end of V1

pitchtracks %>% 
  filter(segm == "V1" & !is.na(Pitch)) %>% 
  group_by(Word, Quantity) %>%
  mutate(V1_end_f0 = Pitch[which.max(time)]) %>%
  ungroup() %>% 
  select(Word, Quantity, V1_end_f0) %>% 
  distinct() %>% 
  group_by(Quantity) %>% 
  summarise(mean = mean(V1_end_f0)) %>% 
  flextable()

Quantity

mean

Q1

186.3298

Q2

181.3226

Q3

156.7389

Get the max f0.

pitchtracks %>% 
  filter(segm == "V1" & !is.na(Pitch)) %>% 
  group_by(Word, Quantity) %>%
  mutate(max = max(Pitch)) %>%
  ungroup() %>% 
  select(Word, Quantity, max) %>% 
  distinct() %>% 
  group_by(Quantity) %>% 
  summarise(mean = mean(max)) %>% 
  flextable()

Quantity

mean

Q1

192.9386

Q2

189.4437

Q3

197.8732

Plotting the duration (Figure 1)

library(rPraat)

dir = "basewords/"
durs <- data.frame()
for(q in c("Q2","Q3")){
  for(bsw in c("sada","sage", "sagi", "ladi", "liga", "vide", "vigi", "tada")){
    fname = paste0(q, "_", bsw)
    tg = tg.read(paste0(dir, fname, ".TextGrid"))
    tmp <- data.frame(
    Quantity = q,
    Word = gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",bsw),
    V1 = (tg$häälikud$t2[2] - tg$häälikud$t1[2])*1000,
    V2 = (tg$häälikud$t2[4] - tg$häälikud$t1[4])*1000)
    durs <- rbind(durs, tmp)
  }
}
durs$Word <- factor(durs$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada"))

(joonis.kest <- durs %>% 
  mutate(`V1/V2 ratio` = V1/V2) %>% 
  pivot_longer(cols = 3:5, names_to = "var", values_to = "Duration") %>% 
  mutate_at(vars(var), funs(factor(., levels = c("V1","V2","V1/V2 ratio") ))) %>% 
  ggplot(aes(y=Duration, x=Quantity))+
  geom_boxplot()+
  #facet_grid(cols = vars(var))+
  facet_wrap("var", scales = "free")+
  theme_bw())

ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.pdf", device = "pdf", width = 8, height = 3, units = "in")
ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.png", device = "png", width = 8, height = 3, units = "in")

This is the end of the code sheet.