# This is the code for runing the analysis published in # Lippus, Pärtel & Eva Liina Asu. 2019. The timing and quality of diphthong components in spontaneous Estonian. Sasha Calhoun, Paola Escudero, Marija Tabain & Paul Warren (toim), Proceedings of the 19th International Congress of Phonetic Sciences, Melbourne, Australia 2019, 1139–1143. Canberra, Australia: Australasian Speech Science and Technology Association Inc. https://www.internationalphoneticassociation.org/icphs-proceedings/ICPhS2019/papers/ICPhS_1188.pdf. # This R script file has been revisited for a very light cleanup and commenting 17.10.2025. # # CC by, Pärtel Lippus 2019 # read the data dat <- read.delim("lippus_asu_icphs2019_diphthongs_2018-11-21.txt", stringsAsFactors = T) ## setting some factor levels # fix the order of SAMPA dat$vok1.1 <- factor(dat$vok1.1, levels=c( "A", "{", "e", "2", "7", "o", "i", "y", "u")) dat$vok2.1 <- factor(dat$vok2.1, levels=c( "A", "{", "e", "2", "7", "o", "i", "y", "u")) # categorise vowels into height and frontedness categories dat$vok1.2 <- dat$vok1.1; levels(dat$vok1.2) <- c("low", "low", "mid", "mid", "mid", "mid", "high", "high", "high") dat$vok2.2 <- dat$vok2.1; levels(dat$vok2.2) <- c("low", "low", "mid", "mid", "mid", "mid", "high", "high", "high") dat$vok1.2.1 <- dat$vok1.1; levels(dat$vok1.2.1) <- c("back", "front", "front", "front", "back", "back", "front", "front", "back") dat$vok2.2.1 <- dat$vok2.1; levels(dat$vok2.2.1) <- c("back", "front", "front", "front", "back", "back", "front", "front", "back") # convert SAMPA to IPA symbols dat$vok1.3 <- dat$vok1.1; levels(dat$vok1.3) <- c("ɑ","æ","e","ø","ɤ","o","i","y","u") dat$vok2.3 <- dat$vok2.1; levels(dat$vok2.3) <- c("ɑ","æ","e","ø","ɤ","o","i","y","u") dat$vowel <- factor(dat$vowel, levels = c("AA","{{","ee", "22", "77", "oo", "ii", "yy", "uu", "Ae", "Ai", "Au", "{i", "eA", "ei", "7i", "7u")) dat$ftong <- factor(dat$ftong, levels=c("mono","dift")) cat(colnames(dat), sep = "\n") ##### THE variables in the data # # file -- file ID (anonymised) # phon -- the word transcription in SAMPA # morph -- morphological analysis (from Vabamorf) # ftong -- is the vowel monophthong or a diphthong (mono/dift) # quantity -- the quantity of the word (Q2/Q3) # vok1 -- phoneme label in the beginning of the vocalic sequence # vok2 -- phoneme label in the end of the sequence (in case of mono vok2 == vok1) # vok_start -- start time of the vocalic sequence # vok_stop -- end time of the vocalic sequence # vok_boundary -- boundary between vok1 and vok2 (if ftong == dift) # formant_ceiling -- the optimal ceiling for the formant analysis # f1_1 .. f1_30 -- F1 values from 30 equidistant points # f2_1 .. f2_30 -- F2 values # f3_1 .. f3_30 -- F3 values # vowel -- vowel label for the whole sequence # disp -- ? # SP -- speaker ID (anonymised) # gender -- gender (F/M) # vok1.1 -- first character of vok1 label (the phoneme without diacritics) # vok2.1 -- first char of vok2 # dift -- vok1.1 + vok2.1 # dur -- duration # dur2 -- ? # dift_piir # vok1.2 -- first component height (low/mid/high) # vok2.2 -- second component height # vok1.2.1 -- first component frontedness (front/back) # vok2.2.1 -- second component frontedness # vok1.3 -- IPA label of first component # vok2.3 -- IPA label of second component ### descriptive stats: number of tokens? 23.11.2018 for the ICPhS paper nrow(dat) # total of 5196 tokens length(unique(dat$file)) # 148 speakers, 81 female, 67 male length(unique(dat$SP)) # 119 unique speakers, some participated in more than one recording apply(table(dat$file, dat$ftong),2, summary) # each speaker produced a different number of tokens. on the average 24 monophthongs and 11 diphthongs per speaker #write.table(ftable(dat$gender,dat$quantity, dat$vowel), "tmp_ndift.txt", sep="\t", quote=F) ###### Calculate VSL trajectories library(emuR) f1_smooth <- bark(dat[,paste("f1",1:30,sep="_")]) # f1_smooth <- NULL; for(i in 1:nrow(dat)) f1_smooth<- rbind(f1_smooth, smooth(unlist(bark(dat[i,paste("f1",1:30,sep="_")])))) # f2_smooth <- bark(dat[,paste("f2",1:30,sep="_")]) # f2_smooth <- NULL; for(i in 1:nrow(dat)) f2_smooth<- rbind(f2_smooth, smooth(unlist(bark(dat[i,paste("f2",1:30,sep="_")])))) # #colnames(f1_smooth) <- paste("f1",1:30,sep="_"); colnames(f2_smooth) <- paste("f2",1:30,sep="_") VSL<-matrix(NA, nrow=nrow(dat), ncol=29) for (i in c(1:29)) VSL[,i] <-( (f1_smooth[,paste("f1",i,sep="_")]-f1_smooth[,paste("f1",i+1,sep="_")])^2 + (f2_smooth[,paste("f2",i,sep="_")]-f2_smooth[,paste("f2",i+1,sep="_")])^2 )^0.5 ### Get the VSL maxima dat$VSLmax <- apply(VSL, 1, which.max) #x2 <- 2/pi*asin(sqrt(x)) #x_mean <- apply(VSL, 1, mean) #x_max <- apply(VSL, 1, max) # TL == Trajectory Length dat$TL <- rowSums(VSL) ## Define the function for Euclidian distance euklid <- function(a,b) { if(length(a) != length(b)) stop("vektorid a ja b peavad olema sama pikad") sqrt(sum((a-b)^2)) } # calculate the formant trajectories as Euclidian distance of the measurement points from each end and then find the centre: # distance from the last point euk <- matrix(NA, ncol = 30, nrow=nrow(dat)) for (r in 1: nrow(dat)) {for(i in 1:30) {euk[r,i] <- euklid(bark(dat[r,paste0("f", 1:2, "_1")]), bark(dat[r,paste0("f", 1:2, "_", i)])) } } # distance from the first point euk2 <- matrix(NA, ncol = 30, nrow=nrow(dat)) for (r in 1: nrow(dat)) {for(i in 30:1) {euk2[r,i] <- euklid(bark(dat[r,paste0("f", 1:2, "_30")]), bark(dat[r,paste0("f", 1:2, "_", i)])) } } # z as the mid-point of the trajectory which is the least different from both ends euk3 <- euk-euk2 z <- apply(abs(euk3), 1, which.min) # # trajektoori keskpunkt proportsionaalselt diftongi piires dat$traj_kesk_prop <- z/50+0.2 hist(x[tmp]) shapiro.test(z[tmp]) qqline(z[tmp]) # testing the different measures of the boundary of the diphthong components library(lme4) lm1 <- lmer(log(dat$vok_boundary-dat$vok_start)~(1|file), data=dat, subset=ftong=="dift") lm2 <- lmer((z/50+0.2)~(1|file)+(1|vowel), data=dat, subset=ftong=="dift") lm3 <- lmer((z/50+0.2)~quantity+(1|file)+(1|vowel), data=dat, subset=ftong=="dift") lm4 <- lmer((z/50+0.2)~quantity+gender+(1|file)+(1|vowel), data=dat, subset=ftong=="dift") lm5 <- lmer((z/50+0.2)~quantity*vok1.2+(1|file)+(1|vowel), data=dat, subset=ftong=="dift") anova(lm2, lm3) anova(lm3) summary(lm3) boxplot(log(TL)~quantity*ftong, data=dat) ### ICPhS paper fig 2 png( file="icphs_dift_fig2.png", width = 1200,height = 500, res = 100) par(mfcol=c(1,2), ps=12, mar=c(5,5,2,0)+0.1) for (d in levels(dat$ftong)){ plot(1, type="n", ylim=c(-5,2), xlim=c(0, 0.155), axes=F, xlab="Time (ms)", ylab="", main=toupper(paste0(gsub("ft","",d), "phthongs"))) tmp1 <- apply(euk3[dat$ftong==d,], 2, tapply, dat$quantity[dat$ftong==d], mean) tmp2 <- tapply(dat$dur[dat$ftong==d], dat$quantity[dat$ftong==d], mean) for (i in 1:2){ if (d == "mono"){ lines(y= tmp1[i,], x= (0:29/29*(tmp2[i]*0.6)+0.2*tmp2[i]) , lty=i, lwd=2) } if (d == "dift"){ for(g in 1:29){lines(y= tmp1[i,c(g,g+1)], x=(c(g-1,g)/29*(tmp2[i]*0.6)+0.2*tmp2[i]), lty=i+1, lwd=2, col=rainbow(45, alpha = 0.9)[g])} } points(approx( x= tmp1[i,], y= (0:29/29*(tmp2[i]*0.6)+0.2*tmp2[i]), xout=0)$y, y=0, pch=16) } axis(side= 2, at=-2:2, las=1) mtext(side=2, at = 0, line=3, text = "Distance from mid-point (bark)", cex=.9) abline(h=-2.25) # VSL kontuurid tmp1 <- apply(VSL[dat$ftong==d,], 2, tapply, dat$quantity[dat$ftong==d], mean) for (i in 1:2) { if (d == "mono"){ lines(y= smooth(tmp1[i,]*15-4.5), x= (1:29/29*(tmp2[i]*0.6)+0.2*tmp2[i]) , lty=i, lwd=2) } if (d == "dift"){ for(g in 1:28){lines(y= smooth(tmp1[i,]*15-4.5)[c(g,g+1)], x=(c(g,g+1)/29*(tmp2[i]*0.6)+0.2*tmp2[i]), lty=i+1, lwd=2, col=rainbow(45, alpha = 0.9)[g])} } points( approx(y= smooth(tmp1[i,]*15-4.5), x= (1:29/29*(tmp2[i]*0.6)+0.2*tmp2[i]), xout = (mean(dat$VSLmax[dat$ftong==d & dat$quantity==paste0("Q",i+1)])/29 *(tmp2[i]*0.6)+0.2*tmp2[i])), pch=16) } axis(side=2, at=c(0.04,0.08,0.12)*15-4.5, labels=c(0.04,0.08,0.12), las=1 ) mtext(side=2, at = 0.085*15-4.5, line=3, text = "VSL (bark)", cex=0.9) abline(h=-4) if (d == "mono"){ for(i in 1:2) polygon(x=c(0,tmp2[i],tmp2[i],0,0), y= c(-4, -4, -4.25,-4.25, -4)-i*0.35, col="gray" ) } if (d == "dift"){ tmp3 <- tapply(dat$dift_piir[dat$ftong==d], dat$quantity[dat$ftong==d], mean) for(i in 1:2) {polygon(x=c(0,tmp3[i],tmp3[i],0,0), y= c(-4, -4, -4.25,-4.25, -4)-i*0.35, col=rainbow(45, alpha = 0.4)[1] ) polygon(x=c(tmp3[i],tmp2[i],tmp2[i],tmp3[i],tmp3[i]), y= c(-4, -4, -4.25,-4.25, -4)-i*0.35, col=rainbow(45, alpha = 0.4)[29] )} } axis(side=2, at=c(-4.475, -4.825), labels = c("Q2","Q3"), las=1, tick=F) box(); axis(side = 1, at=c(0:3)*5/100, labels=c(0:3)*50) } dev.off() ####### diftongid <- c("{i", "7i", "7u", "Ae", "Ai", "Au", "eA", "ei" ) tmp=dat$vowel %in% diftongid lm0 <- lmer(dift_piir/dur~ (1|file) , data=dat, subset=tmp) lm1 <- lmer(dift_piir/dur~vok1.2 + (1|file) , data=dat, subset=tmp) lm2 <- lmer(dift_piir/dur~vok1.2 + quantity + (1|file) , data=dat, subset=tmp) anova(lm1,lm2) lm3 <- lmer(dift_piir/dur~ vok1.2*quantity + (1|file) , data=dat, subset=tmp) anova(lm3,lm2) # the difference matrix of diphthong beginings and ends from the target monophthongs (calculated in groups of gender, values in bark) # !! This takes some time to compute !! alg_siht <- matrix(NA, ncol=9, nrow=nrow(dat)); colnames(alg_siht)<-levels(dat$vok1.1); rownames(alg_siht)<-dat$vowel for (i in 1:nrow(dat)){ for(j in levels(dat$vok1.1)){ alg_siht[i, j] <- euklid(bark(dat[i, paste0("f", 1:2, "_1")]), c(mean(apply(bark(dat[dat$vok1.1==j & dat$gender==dat$gender[i] &dat$ftong=="mono" , paste0("f1_", 1:30)]), 1, mean) ), mean(apply(bark(dat[dat$vok1.1==j & dat$gender==dat$gender[i] &dat$ftong=="mono" , paste0("f2_", 1:30)]), 1, mean) )) ) }} apply(alg_siht, 2, tapply, rownames(alg_siht), mean) lop_siht <- matrix(NA, ncol=9, nrow=nrow(dat)); colnames(lop_siht)<-levels(dat$vok2.1); rownames(lop_siht)<-dat$vowel for (i in 1:nrow(dat)){ for(j in levels(dat$vok2.1)){ lop_siht[i, j] <- euklid(bark(dat[i, paste0("f", 1:2, "_30")]), c(mean(apply(bark(dat[dat$vok2.1==j & dat$gender==dat$gender[i] &dat$ftong=="mono" , paste0("f1_", 1:30)]), 1, mean) ), mean(apply(bark(dat[dat$vok2.1==j & dat$gender==dat$gender[i] &dat$ftong=="mono" , paste0("f2_", 1:30)]), 1, mean) )) ) }} # Table 2 of the ICPHs2019 paper diftongid <- c("Ae", "Ai", "Au", "{i", "eA", "ei", "7i", "7u") diftongid2 <- c("ɑe", "ɑi", "ɑu", "æi", "eɑ", "ei", "ɤi", "ɤu") png(filename = paste0("icphs_dift_fig3.png"), width = 600, height = 300, res = 100) par(mfcol=c(1,2), mar=c(0,1.5,1.5,0.5)+0.1) for(f in 2:3){ if (f==2) tmp=t(apply(alg_siht, 2, tapply, rownames(alg_siht), mean)[diftongid,]) if (f==3) tmp=t(apply(lop_siht, 2, tapply, rownames(lop_siht), mean)[diftongid,]) plot(1, xlim=c(1-0.15,9.15), ylim=c(8.15,1-0.15), type="n", ylab="", xlab="", axes=F) for (i in 1:8){ for(j in which(round(tmp,1)[,i]==min(round(tmp,1)[,i]))) polygon(x=c(j-0.5,j-0.5,j+.5,j+.5), y=c(i-.5,i+.5,i+.5,i-.5), col="#FF000066") for(j in which(rownames(tmp)==substr(colnames(tmp),f-1,f-1)[i])) polygon(x=c(j-0.5,j-0.5,j+.5,j+.5), y=c(i-.5,i+.5,i+.5,i-.5), col="#0044FF66") } text(x=rep(1:9), y=rep(1:8, each=9), labels= round(tmp,1)) box() axis(side=3, at=1:9, labels = levels(dat$vok1.3), tick = F, line=-0.5) axis(side=2, at=1:8, labels = diftongid2, las=1, tick = F, line=-0.5) } dev.off() # FP slaididele artikli tab2 png(filename = paste0("FP_dift_fig3.png"), width = 1200, height = 500, res = 100) par(mfcol=c(1,2), mar=c(0,1.5,4,6)+0.1, xpd=F) for(f in 2:3){ if (f==2) tmp=t(apply(alg_siht, 2, tapply, rownames(alg_siht), mean)[diftongid,]) if (f==3) tmp=t(apply(lop_siht, 2, tapply, rownames(lop_siht), mean)[diftongid,]) plot(1, xlim=c(1-0.15,9.15), ylim=c(8.15,1-0.15), type="n", ylab="", xlab="", axes=F, main=paste("Component", f-1)) for (i in 1:8){ for(j in which(round(tmp,1)[,i]==min(round(tmp,1)[,i]))) polygon(x=c(j-0.5,j-0.5,j+.5,j+.5), y=c(i-.5,i+.5,i+.5,i-.5), col="#FF000066") for(j in which(rownames(tmp)==substr(colnames(tmp),f-1,f-1)[i])) polygon(x=c(j-0.5,j-0.5,j+.5,j+.5), y=c(i-.5,i+.5,i+.5,i-.5), col="#0044FF66") } text(x=rep(1:9), y=rep(1:8, each=9), labels= round(tmp,1)) box() axis(side=3, at=1:9, labels = levels(dat$vok1.3), tick = F, line=-0.5) axis(side=2, at=1:8, labels = diftongid2, las=1, tick = F, line=-0.5) if (f ==3) { par(xpd=T) legend(x=9.5, y=0.2, fill=c("#0044FF66", "#FF000066","#AA81CE" ), legend = c("expected", "closest", "MATCH"), bty="n") } } dev.off() lm2 <- lmer(alg_siht_target~vowel+(1|file), data=dat, subset=ftong=="dift") anova(lm1,lm2) summary(lm2) ### ICPHs paper figure 1: diphthong formants trajectories in F1-F2 space png( file="icphs_dift_fig1.png", width = 1200,height = 800, res = 100) par(mfrow=c(2,2), ps=12, mar=c(5,5,2,0)+0.1) for(i in levels(dat$gender)){ for(q in levels(dat$quantity)){ tmp = dat$ftong=="mono" & dat$gender == i & dat$quantity==q tmp1 <- tapply(apply(bark(dat[tmp,paste0("f1_",1:30)]),1, mean), dat$vok1.3[tmp], mean) tmp2 <- tapply(apply(bark(dat[tmp,paste0("f2_",1:30)]),1, mean), dat$vok1.3[tmp], mean) plot(y=tmp1, x=tmp2, xlim=quantile(tmp2, probs = c(1,0))+c(0.1,-0.1), ylim=quantile(tmp1, probs = c(1,0))+c(0.25,-0.1), pch =16, col="gray", main=paste(i,q), xlab="F2 (bark)", ylab="F1 (bark)", cex=4, las=1) grid() for (v in levels(dat$vok1.3)) lines(y=colMeans(bark(dat[tmp&dat$vok1.3==v,paste0("f1_",1:30)])), x=colMeans(bark(dat[tmp&dat$vok1.3==v,paste0("f2_",1:30)])) , col="gray", lwd=3) text(y=tmp1, x=tmp2, xlim=quantile(tmp2, probs = c(1,0)), ylim=quantile(tmp1, probs = c(1,0)), labels =names(tmp1)) for (v in diftongid){ tmp = dat$ftong=="dift" & dat$gender == i & dat$quantity==q & dat$vowel == v for(g in 1:29){lines(y=colMeans(bark(dat[tmp,paste0("f1_",c(g,g+1))])), x=colMeans(bark(dat[tmp,paste0("f2_",c(g,g+1))])),type="l", lwd=3, col=rainbow(45, alpha = 0.9)[g])} points(y=mean(bark(dat[tmp,"f1_1"])), x=mean(bark(dat[tmp,"f2_1"])), pch=16, cex=4, col=rainbow(45, alpha = 0.4)[1]) text(y=mean(bark(dat[tmp,"f1_1"])), x=mean(bark(dat[tmp,"f2_1"])), labels = unique(dat$vok1.3[tmp]), col="black") points(y=mean(bark(dat[tmp,"f1_30"])), x=mean(bark(dat[tmp,"f2_30"])), pch=16, cex=4, col=rainbow(45, alpha = 0.4)[29]) text(y=mean(bark(dat[tmp,"f1_30"])), x=mean(bark(dat[tmp,"f2_30"])), labels = unique(dat$vok2.3[tmp]), col="black") } }} dev.off() # End of the code