This is the R code for conducting the analysis published in Lippus, Pärtel, Liis Kask, Sofia Lutter, Nele Põldver, Kairi Kreegipuu. 2025. Mapping boundaries: Microprosodic and dialectal variability in the perception of Estonian long and overlong quantity. Eesti Rakenduslingvistika Ühingu aastaraamat = Estonian Papers in Applied Linguistics 21.
Load packages and Estonian base map for plotting the geoinfo.
library(tidyverse)
library(flextable)
# aluskaart
library(rworldmap)
library(ggthemes)
data("countryExData", envir = environment(), package = "rworldmap")
mymap = joinCountryData2Map(countryExData, joinCode = "ISO3", nameJoinColumn = "ISO3V10", mapResolution = "low") %>% fortify(mymap) %>% subset(id=="Estonia")
## 149 codes from your data successfully matched countries in the map
## 0 codes from your data failed to match with a country code in the map
## 95 codes from the map weren't represented in your data
The data comes from an online perception study carried out in the Kaemus online experiment enviroment. The stimuli for the perception study were recorded from a female Estonian speaker and re-synthesised in Praat. The original recordings, the Praat scripts for running the manipulation of durations and f0 contours and stimulus files are included in the current repository. The perception study data has been converted from Kaemus output to Rda file, it has been manually sorted and modified (adding latitude and longitude data for the locations, for example).
Read the full dataset.
load("mapping_boundaries_dataset_full.Rda")
# remove rows with no response
dat<-uuring[!is.na(uuring$response),]
dat$q3resp <- as.numeric(dat$response=="Q3")
The table dat
has the following columns:
We leave out all participants who did not recognise the control stimuli with Q1 properties correctly as Q1.
valesti_vastanud_KId <- unique(dat$KI[which(dat$base_quant=="Q1" & dat$v1dur==110 & dat$response!="Q1")])
dat <- dat[!dat$KI %in% valesti_vastanud_KId,]
Leave out the trials where the responce to other stimuli was Q1.
dat <- dat[dat$v1dur!=110 & dat$response!="Q1",]
# refactor the responce to leave out Q1 which is now an empty category
dat$response <- factor(dat$response)
Add factor pitch_cue
to describe the pitch contour of
the stimuli. If the base word was Q2 and the stimuli are created by
manipulation of durations (Sets 1 & 2) or the base word was Q3 and
the stimuli are created by manipulation of f0 (Set 3), the pitch is
level (in the stressed syllable). Otherwise (Q3 baseword Set 1 & 2
and Q2 baseword Set 3) the pitch is falling.
# teeb põhitooni kirjeldava faktori pitch: level/fall
dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q2") | (dat$manip != "f0" & dat$base_quant == "Q3")] <- "fall"
dat$pitch_cue[(dat$manip == "f0" & dat$base_quant == "Q3") | (dat$manip != "f0" & dat$base_quant != "Q3")] <- "level"
dat$pitch_cue <- factor(dat$pitch_cue, levels = c("level","fall"))
Calculate the Pitch Sensitivity Scores
(dur2pitch_score
).
# kestuse-vs-põhitooni skoor
for(i in unique(dat$KI)){
tmp <- dat[dat$KI==i & dat$base_word!="tada",]
dat$dur2pitch_score[dat$KI==i] <- length(which( (tmp$response=="Q3" & tmp$pitch_cue=="level") | (tmp$response=="Q2" & tmp$pitch_cue=="fall") )) / nrow(tmp)
}
Split the dataset by speakers’ mother tongue. In this study we
continue by analysing the Estonian L1 subset in
dat.ee
.
dat.ee <- dat[dat$mothertongue=="eesti",]
dat.vn <- dat[dat$mothertongue!="eesti",]
The table dat.ee
has the data by stimuli. Now we create
a table ki.ee
with data by
participants.
dat.ee %>%
distinct(KI, age, gender, mother_place, mother_lat, mother_lon, place, place_lat, place_lon, father_lat, father_lon, education, dur2pitch_score, rhythm, hear_out_of_tune, language_skills, dialects,speechrate, speechrate_self) -> ki.ee
Save the subset tables as one Rda file and also separately as CSV files.
save(file="mapping_boundaries_dataset_filtered.Rda", dat.ee, ki.ee)
write_csv(dat.ee, file = "mapping_boundaries_data_by_stimuli.csv")
write_csv(ki.ee, file = "mapping_boundaries_data_by_participants.csv")
Number of participants by gender:
ki.ee %>%
group_by(gender) %>%
summarise(number = n()) %>%
flextable()
gender | number |
---|---|
F | 250 |
M | 40 |
Number of participants by education:
ki.ee %>%
group_by(education) %>%
summarise(number = n()) %>%
flextable()
education | number |
---|---|
kesk | 86 |
kesk-eri | 16 |
BA | 66 |
MA | 98 |
PhD | 21 |
3 |
Plot the place of habitation of the participants (Figure 3).
set.seed(99)
joonis_elukoht <- ggplot(ki.ee) +
geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
geom_jitter( aes(x = place_lon, y=place_lat), alpha = 0.4, width=0.1, height =0.1, na.rm = T) +
scale_color_brewer(palette = "Dark2") +
theme_map() +
ylim(57.5,59.7) + xlim(21.8,28.2) +
NULL
joonis_elukoht
Save the figures for the paper.
ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.pdf", device = "pdf", width = 8, height = 5, units = "in")
ggsave(plot = joonis_elukoht, filename = "lippus_jt_rly2025_fig_3.png", device = "png", width = 8, height = 5, units = "in")
Plot the place of habitation of the participants and their parents (Figure 7).
set.seed(99)
joonis_paritolu <- ki.ee %>%
pivot_longer(., cols = c("mother_lon","mother_lat","place_lon","place_lat","father_lat","father_lon"), names_to = "tunnus", values_to = "väärtus") %>%
mutate(pos = gsub("(.*)_(.*)", "\\2",tunnus), tunnus = gsub("(.*)_(.*)", "\\1",tunnus)) %>%
pivot_wider(., names_from = "pos", values_from = "väärtus") %>%
mutate(tunnus = factor(tunnus, levels = c("mother", "place", "father"))) %>%
mutate(tunnus = fct_recode(tunnus, participant = "place")) %>%
mutate(x = interaction(lat, lon)) %>%
group_by(x) %>% mutate(freq = n()) %>% ungroup() %>%
ggplot() +
geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
geom_line(aes(x = lon, y = lat, group = KI), col = "darkgray", alpha = 0.4, position = "jitter") +
geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+
geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+
geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = tunnus, shape = tunnus), alpha = 0.8, na.rm = T)+
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values=c(2, 16, 6))+
theme_map() +
theme(legend.title=element_blank())+
ylim(57.5,59.7) + xlim(21.8,28.2) +
NULL
joonis_paritolu
## Warning: Removed 26 rows containing missing values (`geom_line()`).
ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.png", device = "png", width = 8, height = 5, units = "in")
ggsave(plot = joonis_paritolu, filename = "lippus_jt_rly2025_fig_7.pdf", device = "pdf", width = 8, height = 5, units = "in")
Plot the regional background of the participants by Pitch Sensitivity Score (Figure 5)
set.seed(99)
joonis_dur2pich_score<-ki.ee %>%
mutate(cue = c( "duration", "pitch")[2- as.numeric(ki.ee$dur2pitch_score < 0.1) ], lat = mother_lat, lon = mother_lon) %>%
mutate(x = interaction(lat, lon)) %>%
group_by(x) %>% mutate(freq = n()) %>% ungroup() %>%
ggplot(aes(y = place_lat, x = place_lon, col = cue))+
geom_polygon(data=mymap, aes(long, lat, group = group), inherit.aes = F, fill="white", col="black") +
geom_jitter(data = ~subset(., freq > 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.1, height =0.1, na.rm = T)+
geom_jitter(data = ~subset(., freq > 1 & freq < 10), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.6, width=0.03, height =0.03, na.rm = T)+
geom_point(data = ~subset(., freq == 1), aes(x = lon, y = lat, col = cue, shape = cue), alpha = 0.8, na.rm = T)+
scale_color_brewer(palette = "Dark2") +
scale_shape_manual(values=c(2, 16, 6))+
theme_map() +
theme(legend.title=element_blank())+
ylim(57.5,59.7) + xlim(21.8,28.2) +
NULL
joonis_dur2pich_score
ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.png", device = "png", width = 8, height = 5, units = "in")
ggsave(plot = joonis_dur2pich_score, filename = "lippus_jt_rly2025_fig_5.pdf", device = "pdf", width = 8, height = 5, units = "in")
The figure of proportion of Q3 responses by V1 duration pitch cue and phoneme sequences (Figure 4).
levels(dat.ee$base_word) <- gsub("^(.)(.)(.*)$", "\\1\\2\\2\\3",levels(dat.ee$base_word))
(joonis.main.result <- dat.ee %>%
filter(manip != "f0", dur2pitch_score>0.1) %>%
group_by(base_word, pitch_cue, v1dur) %>%
summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate,
conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1],
conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>%
group_by(base_word, pitch_cue) %>%
mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>%
mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>%
ggplot(aes(y = Q3prop, x=v1dur, fill= base_word))+
geom_hline(yintercept=.5, linetype="dashed", color = "black")+
geom_line(aes(col=base_word))+
geom_text(aes(label=base_word), size = 2.5)+
scale_x_continuous(breaks = c(170,200, 230, 260, 290))+
geom_ribbon(aes(ymin= conf.low, ymax = conf.high), alpha=0.35)+
geom_rect( aes(xmin = ylemine, xmax = alumine, ymin = 0, ymax =1), alpha = 0.2, fill = "gray")+
scale_color_brewer(palette = "Dark2") +
scale_fill_brewer(palette = "Dark2") +
facet_grid(rows = vars(base_word), cols= vars(pitch_cue))+
labs(y = "Proportion of Q3 responses", x = "V1 duration")+
guides(fill = F, color = F)+
theme_bw()+
NULL)
ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.png", device = "png", width = 6, height = 6, units = "in")
ggsave(plot = joonis.main.result, filename = "lippus_jt_rly2025_fig_4.pdf", device = "pdf", width = 6, height = 6, units = "in")
Finding the category boundary by base word
dat.ee %>%
filter(manip != "f0", dur2pitch_score>0.1) %>%
group_by(base_word, pitch_cue, v1dur) %>%
summarise(Q3prop = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$estimate,
conf.low = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[1],
conf.high = poisson.test(x=sum(q3resp), T=n(), conf.level = 0.99)$conf.int[2]) %>%
group_by(base_word, pitch_cue) %>%
mutate(alumine = approx(x = conf.low, y = v1dur, xout = 0.51)$y, ylemine = approx(x = conf.high, y = v1dur, xout = 0.49)$y) %>%
mutate(alumine = ifelse(is.na(alumine) & !is.na(ylemine), 290, alumine), ylemine = ifelse(is.na(ylemine) & !is.na(alumine), 170, ylemine)) %>%
select(base_word, pitch_cue, alumine, ylemine) %>%
distinct() -> tmp
tmp[tmp$pitch_cue=="level", c("base_word", "ylemine")] %>%
flextable() %>%
set_caption("The lowest V1 duration from which stimuli with level pitch are perceived as Q3")
base_word | ylemine |
---|---|
saada | 286.6925 |
saage | 267.9562 |
saagi | |
laadi | 286.9299 |
liiga | 244.8272 |
viide | 272.1807 |
viigi | 274.6668 |
taada | 283.4490 |
tmp[tmp$pitch_cue=="fall", c("base_word", "alumine")] %>%
flextable() %>%
set_caption("The highest V1 duration until which stimuli with falling pitch are perceived as Q2")
base_word | alumine |
---|---|
saada | 209.4782 |
saage | 222.6034 |
saagi | 233.9322 |
laadi | 215.5374 |
liiga | 236.0390 |
viide | 207.4542 |
viigi | 213.6724 |
taada | 290.0000 |
library(randomForest)
library(caTools)
Split the data into training and testing subsets.
set.seed(66)
split <- sample.split(ki.ee, SplitRatio = 0.8)
ki_train <- subset(ki.ee, split==T)
ki_test <- subset(ki.ee, split==F)
Run the model.
set.seed(43)
mets <- randomForest(dur2pitch_score ~ age + gender + mother_lat + mother_lon + education + rhythm + hear_out_of_tune + language_skills + dialects + speechrate + speechrate_self, data = ki.ee, importance = T, na.action = na.omit)
mets
##
## Call:
## randomForest(formula = dur2pitch_score ~ age + gender + mother_lat + mother_lon + education + rhythm + hear_out_of_tune + language_skills + dialects + speechrate + speechrate_self, data = ki.ee, importance = T, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.02606669
## % Var explained: 13.02
importance(mets, scale=F)
## %IncMSE IncNodePurity
## age 2.444087e-03 1.2181820
## gender -9.592358e-05 0.1335064
## mother_lat 3.920165e-04 0.9430299
## mother_lon 1.786110e-03 1.1320883
## education 2.219690e-03 0.5784624
## rhythm 2.683659e-03 0.9143883
## hear_out_of_tune 1.157358e-03 0.3846429
## language_skills 4.147133e-04 0.3524418
## dialects -4.822263e-05 0.4015656
## speechrate 4.504805e-04 0.5645336
## speechrate_self 7.418151e-04 0.4927696
#varImpPlot(mets, scale=F, type = 1)
pred_test <- predict(mets, newdata = ki_test, type = "class")
x <- (table(pred_test,ki_test$dur2pitch_score))
round((x[1,1]+x[2,2])/sum(x)*100)
## [1] 3
Plot the results (Figure 6).
df <- as.data.frame(importance(mets, scale = F))
(joonis_mets <- ggplot(data = df, aes(y = reorder(rownames(df), `%IncMSE`), x = `%IncMSE`, alpha=`%IncMSE`)) +
geom_bar(stat = "identity") +
labs(y="")+
guides(fill = F, color = F, alpha =F)+
theme_bw()+
NULL)
ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.png", device = "png", width = 5, height = 3, units = "in")
ggsave(plot = joonis_mets, filename = "lippus_jt_rly2025_fig_6.pdf", device = "pdf", width = 5, height = 3, units = "in")
Read the Pitch tracks of the base words measured in Praat and plot the pitch curves (Figure 2).
pitchtracks <- read.delim("baseword_pitch_tracks.txt", na.strings = "--undefined--")
pitchtracks$segm <- factor(pitchtracks$segm); levels(pitchtracks$segm) = c("C1","V1", "C2", "V2")
pitchtracks$Word <- gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",pitchtracks$Word)
pitchtracks$Word <- factor(pitchtracks$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada"))
(joonis.f0 <- pitchtracks %>%
filter (Quantity != "Q1", segm %in% c("V1","V2")) %>%
ggplot( aes(y=Pitch, x=time*1000, group=interaction(Word, segm), col=Word))+
geom_line(alpha = 0.7, lwd = 2)+
facet_grid(cols = vars(Quantity))+
scale_color_brewer(palette = "Dark2") +
xlim(0,500) +
labs(y = "Pitch (Hz)", x = "Time (ms)")+
theme_bw()+
NULL)
ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.pdf", device = "pdf", width = 8, height = 3, units = "in")
ggsave(plot = joonis.f0, filename = "lippus_jt_rly2025_fig_2.png", device = "png", width = 8, height = 3, units = "in")
Get the F0 value in the end of V1
pitchtracks %>%
filter(segm == "V1" & !is.na(Pitch)) %>%
group_by(Word, Quantity) %>%
mutate(V1_end_f0 = Pitch[which.max(time)]) %>%
ungroup() %>%
select(Word, Quantity, V1_end_f0) %>%
distinct() %>%
group_by(Quantity) %>%
summarise(mean = mean(V1_end_f0)) %>%
flextable()
Quantity | mean |
---|---|
Q1 | 186.3298 |
Q2 | 181.3226 |
Q3 | 156.7389 |
Get the max f0.
pitchtracks %>%
filter(segm == "V1" & !is.na(Pitch)) %>%
group_by(Word, Quantity) %>%
mutate(max = max(Pitch)) %>%
ungroup() %>%
select(Word, Quantity, max) %>%
distinct() %>%
group_by(Quantity) %>%
summarise(mean = mean(max)) %>%
flextable()
Quantity | mean |
---|---|
Q1 | 192.9386 |
Q2 | 189.4437 |
Q3 | 197.8732 |
Plotting the duration (Figure 1)
library(rPraat)
dir = "basewords/"
durs <- data.frame()
for(q in c("Q2","Q3")){
for(bsw in c("sada","sage", "sagi", "ladi", "liga", "vide", "vigi", "tada")){
fname = paste0(q, "_", bsw)
tg = tg.read(paste0(dir, fname, ".TextGrid"))
tmp <- data.frame(
Quantity = q,
Word = gsub("^(.)(.)(..)$", "\\1\\2\\2\\3",bsw),
V1 = (tg$häälikud$t2[2] - tg$häälikud$t1[2])*1000,
V2 = (tg$häälikud$t2[4] - tg$häälikud$t1[4])*1000)
durs <- rbind(durs, tmp)
}
}
durs$Word <- factor(durs$Word, levels = c("saada","saage", "saagi", "laadi", "liiga", "viide", "viigi", "taada"))
(joonis.kest <- durs %>%
mutate(`V1/V2 ratio` = V1/V2) %>%
pivot_longer(cols = 3:5, names_to = "var", values_to = "Duration") %>%
mutate_at(vars(var), funs(factor(., levels = c("V1","V2","V1/V2 ratio") ))) %>%
ggplot(aes(y=Duration, x=Quantity))+
geom_boxplot()+
#facet_grid(cols = vars(var))+
facet_wrap("var", scales = "free")+
theme_bw())
ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.pdf", device = "pdf", width = 8, height = 3, units = "in")
ggsave(plot = joonis.kest, filename = "lippus_jt_rly2025_fig_1.png", device = "png", width = 8, height = 3, units = "in")
This is the end of the code sheet.