#########################################################################
### Constructional variation in Estonian:                             ###
### demonstrative pronouns and adverbs as determiners in noun phrases ###
#########################################################################

# R code for the study published in Taremaa et al. 2021;
# https://doi.org/10.1016/j.lingua.2021.103030

library(party)
library(lattice)
library(sjPlot)
library(dplyr)
library(Hmisc)
library(FactoMineR)
library(factoextra)
library(ggplot2)

# Data ----
# Read the tab-separated data set and turn every character column into a
# factor.
CVE <- read.delim("CVE.txt", sep = "\t", header = TRUE)
CVE <- CVE %>%
  mutate_if(is.character, as.factor)

# Quick sanity checks of the loaded data.
summary(CVE)
str(CVE)
nrow(CVE)
head(CVE, 3)
colnames(CVE)

### Exploratory analysis

## Figure 1 (MCA)

# Number of categories per column, named by column. Computed column-wise
# with vapply(): apply() would first coerce the whole data frame to a
# character matrix, which is fragile for mixed column types.
cats <- vapply(CVE, function(x) nlevels(as.factor(x)), integer(1))

mca1 <- MCA(CVE, graph = FALSE)

# Category coordinates, each row labelled with the variable it belongs to
# (variable name repeated once per category of that variable).
mca1_vars_df <- data.frame(
  mca1$var$coord,
  Variable = rep(names(cats), cats)
)
# Coordinates of the individual observations.
mca1_obs_df <- data.frame(mca1$ind$coord)

# Contributions of the categories to the first two dimensions.
fviz_contrib(mca1, choice = "var", axes = 1, top = 15)
fviz_contrib(mca1, choice = "var", axes = 2, top = 15)

# plot
ggplot(
  data = mca1_vars_df,
  aes(x = Dim.1, y = Dim.2, label = rownames(mca1_vars_df))
) +
  geom_hline(yintercept = 0, colour = "gray30") +
  geom_vline(xintercept = 0, colour = "gray30") +
  geom_text(
    data = mca1_vars_df,
    aes(
      x = Dim.1, y = Dim.2,
      label = rownames(mca1_vars_df),
      colour = Variable
    )
  ) +
  ggtitle("") +
  scale_color_manual(
    values = c(
      "black", "red4", "red", "grey2", "cyan3", "darkgreen", "blue",
      "navyblue", "darkslategray", "magenta", "orange", "coral3",
      "grey60", "maroon", "steelblue4", "yellow"
    )
  )

### Data modelling

## Figure 2 (cforest). Construction ~ .
# Fix the RNG state so the forests and the permutation-based variable
# importances below are reproducible.
set.seed(82)

# Shared forest settings, reused by every cforest() call in this script.
mycontrols <- cforest_unbiased(ntree = 500, mtry = 3)

CVE.cforest1 <- cforest(
  Construction ~ NounAnimacy + NounConcreteness + NounMobility + NounSize +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  data = CVE,
  controls = mycontrols
)
# Conditional variable importance; the outer parentheses print the result.
(CVE.cforest1.varimp <- varimp(CVE.cforest1, conditional = TRUE))

# plot
# The vertical line marks the absolute value of the smallest importance
# score.
dotplot(
  sort(CVE.cforest1.varimp),
  xlab = "Construction~.",
  panel = function(x, y) {
    panel.dotplot(x, y, pch = 16, cex = 1.1, col = "black")
    panel.abline(v = abs(min(CVE.cforest1.varimp)))
  }
)

# model diagnostics
# Keep every second element of the flattened tree responses and compare it
# with the response recoded from factor codes 1/2 to 0/1.
# NOTE(review): this assumes Construction has exactly two levels - confirm.
CVE.cforest1_pred <- unlist(treeresponse(CVE.cforest1))[c(FALSE, TRUE)]
somers2(CVE.cforest1_pred, as.numeric(CVE$Construction) - 1)

## Table 3. NounConcreteness
library(sjPlot)
sjt.xtab(CVE$Construction, CVE$NounConcreteness, show.row.prc = TRUE)

## Figure 3 (ctree). Construction ~ .
CVE.ctree1 <- ctree(
  Construction ~ NounConcreteness + TextType + NounAnimacy + NounTemporality,
  controls = ctree_control(maxdepth = 3, minbucket = 20),
  data = CVE
)
plot(CVE.ctree1)

# model diagnostics
# Share of observations on the diagonal of the predicted-vs-observed table.
preds.tree <- predict(CVE.ctree1)
tab <- table(preds.tree, CVE$Construction)
sum(diag(tab)) / sum(tab)

## Figure 4 (cforest). DemDistance ~ .
CVE.cforest2 <- cforest(
  DemDistance ~ NounAnimacy + NounConcreteness + NounMobility + NounSize +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  data = CVE,
  controls = mycontrols
)
(CVE.cforest2.varimp <- varimp(CVE.cforest2, conditional = TRUE))

# plot
dotplot(
  sort(CVE.cforest2.varimp),
  xlab = "DemDistance~.",
  panel = function(x, y) {
    panel.dotplot(x, y, pch = 16, cex = 1.1, col = "black")
    panel.abline(v = abs(min(CVE.cforest2.varimp)))
  }
)

# model diagnostics
# NOTE(review): as above, assumes DemDistance has exactly two levels.
CVE.cforest2_pred <- unlist(treeresponse(CVE.cforest2))[c(FALSE, TRUE)]
somers2(CVE.cforest2_pred, as.numeric(CVE$DemDistance) - 1)

## Figure 5 (ctree). DemDistance ~ .
# Conditional inference tree for DemDistance, limited to depth 3 with at
# least 20 observations per terminal node.
CVE.ctree2 <- ctree(
  DemDistance ~ TextType + NounTemporality,
  controls = ctree_control(maxdepth = 3, minbucket = 20),
  data = CVE
)
plot(CVE.ctree2)

# model diagnostics
# Share of observations on the diagonal of the predicted-vs-observed table.
preds.tree <- predict(CVE.ctree2)
tab <- table(preds.tree, CVE$DemDistance)
sum(diag(tab)) / sum(tab)

## Supplementary materials. DemSpatRel ~
CVE.cforest3 <- cforest(
  DemSpatRel ~ NounAnimacy + NounSize + NounConcreteness + NounMobility +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  data = CVE,
  controls = mycontrols
)
# Conditional variable importance; the outer parentheses print the result.
(CVE.cforest3.varimp <- varimp(CVE.cforest3, conditional = TRUE))

# Dot plot of the sorted importances; the vertical line marks the absolute
# value of the smallest importance score.
dotplot(
  sort(CVE.cforest3.varimp),
  xlab = "DemSpatRel~.",
  panel = function(x, y) {
    panel.dotplot(x, y, pch = 16, cex = 1.1, col = "black")
    panel.abline(v = abs(min(CVE.cforest3.varimp)))
  }
)

# Single tree for DemSpatRel on the same predictor set as the forest.
CVE.ctree3 <- ctree(
  DemSpatRel ~ NounAnimacy + NounSize + NounConcreteness + NounMobility +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  controls = ctree_control(maxdepth = 3, minbucket = 20),
  data = CVE
)
plot(CVE.ctree3)

# Classification accuracy of the DemSpatRel tree.
preds.tree <- predict(CVE.ctree3)
tab <- table(preds.tree, CVE$DemSpatRel)
sum(diag(tab)) / sum(tab)

## Supplementary materials. DemCase ~.
# Random forest for DemCase, using the forest settings in `mycontrols`.
CVE.cforest4 <- cforest(
  DemCase ~ NounAnimacy + NounSize + NounConcreteness + NounMobility +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  data = CVE,
  controls = mycontrols
)
# Conditional variable importance; the outer parentheses print the result.
(CVE.cforest4.varimp <- varimp(CVE.cforest4, conditional = TRUE))

# Dot plot of the sorted importances; the vertical line marks the absolute
# value of the smallest importance score.
dotplot(
  sort(CVE.cforest4.varimp),
  xlab = "DemCase~.",
  panel = function(x, y) {
    panel.dotplot(x, y, pch = 16, cex = 1.1, col = "black")
    panel.abline(v = abs(min(CVE.cforest4.varimp)))
  }
)

# Forest diagnostics: keep every second element of the flattened tree
# responses and compare it with the response recoded from 1/2 to 0/1.
# NOTE(review): this assumes DemCase has exactly two levels - confirm.
CVE.cforest4_pred <- unlist(treeresponse(CVE.cforest4))[c(FALSE, TRUE)]
somers2(CVE.cforest4_pred, as.numeric(CVE$DemCase) - 1)

# Single tree for DemCase on the same predictor set as the forest.
CVE.ctree4 <- ctree(
  DemCase ~ NounAnimacy + NounSize + NounConcreteness + NounMobility +
    NounTemporality + NounLemmaLength + NounFrequency + MotionVerb +
    VerbPlacement + TextType + SubCorpus,
  controls = ctree_control(maxdepth = 3, minbucket = 20),
  data = CVE
)
plot(CVE.ctree4)

# Classification accuracy of the DemCase tree. The predictions must be
# tabulated against the modelled response (DemCase), not DemSpatRel,
# otherwise the diagonal of the table is meaningless.
preds.tree <- predict(CVE.ctree4)
tab <- table(preds.tree, CVE$DemCase)
sum(diag(tab)) / sum(tab)