#######################################################################################################
### R code for the paper "Speed and space: semantic asymmetries in motion descriptions in Estonian" ###
#######################################################################################################

# Authors: Piia Taremaa and Anetta Kopecka
# Journal: Cognitive Linguistics
# Status of the paper: accepted


################
### PACKAGES ###
################

# NOTE: the former `rm(list = ls(all = TRUE))` call was removed on purpose.
# It does not give a clean session (attached packages and options survive)
# and it silently deletes the workspace of anyone who sources this script.
# Restart R to run the analysis from a fresh session instead.

# Attach order matters: Rcmdr is attached after dplyr and attaches its
# dependencies car and RcmdrMisc, so an unqualified recode() resolves to
# car::recode() (string syntax) rather than dplyr::recode().
# Do not reorder these calls.
library(dplyr)       # %>%, mutate_if(), filter(), select()
library(Rcmdr)       # binVariable() (via RcmdrMisc) and recode() (via car)
library(sjPlot)      # sjt.xtab() / tab_xtab() cross-tabulations
library(psych)       # describe()
library(lsr)         # cramersV()
library(reshape2)    # melt()
library(ggplot2)     # theme_minimal()
library(party)       # ctree(), ctree_control()
library(factoextra)  # fviz_contrib(), fviz_mca_biplot(), get_mca_ind()
library(FactoMineR)  # MCA()


############
### DATA ###
############

# Read the tab-delimited corpus table. `encoding = "UTF-8"` only marks the
# strings that are read as UTF-8; the file itself is expected to be UTF-8.
data <- read.delim(
  "Taremaa&Kopecka_Speed and space_CL10.11.2022.txt",
  header = TRUE,
  encoding = "UTF-8"
)
# Treat every text column as a categorical variable.
data <- data %>% mutate_if(is.character, as.factor)

# Bin the numeric VerbSpeed variable into three speed classes.
# method = "natural" places the cut points by k-means clustering, which may
# depend on random starting centres, so the seed is fixed to make the bins
# (and everything derived from VerbSpeedCat) reproducible across runs.
set.seed(1)
data$VerbSpeedCat <- with(
  data,
  binVariable(
    VerbSpeed,
    bins = 3,
    method = "natural",
    labels = c("slowVerb", "mediumVerb", "fastVerb")
  )
)

# Pin the level order used in tables and plots. Values that are not listed in
# `levels` would become NA.
data$VerbType <- factor(
  data$VerbType,
  levels = c("MannerVerb", "SourceVerb", "GoalVerb")
)
data$VerbSpeedCat <- factor(
  data$VerbSpeedCat,
  levels = c("slowVerb", "mediumVerb", "fastVerb")
)
data$VerbCat <- factor(
  data$VerbCat,
  levels = c("mediumSV", "mediumGV", "slowMV", "mediumMV", "fastMV")
)

# Quick sanity checks on the imported table.
head(data)
colnames(data)
nrow(data)
str(data)


##############
### TABLES ###
##############

# Table 1: Coding schema
# The structure and the column names of the data frame list the coded variables.
str(data)
colnames(data)

# Table 2: Motion verbs of the study categorised based on VerbType and VerbSpeed.
# Cross-tabulates each verb against its binned speed class. tab_xtab() is the
# current sjPlot name of the former sjt.xtab() and is the function the other
# cross-tabulations in this script already use.
tab_xtab(data$Verb, data$VerbSpeedCat)

# Table 3: The distribution and forms of the semantic units in the corpus data of 12,300 clauses.
# Per-column summary: N of occurrences and the main morphosyntactic devices
data %>% summary()

# Distinct morphosyntactic devices per semantic unit (not presented in the paper)
data$SourceForm %>% unique()
data$LocationForm %>% unique()
data$TrajectoryForm %>% unique()
data$DirectionForm %>% unique()
data$GoalForm %>% unique()
data$DistanceForm %>% unique()
data$TimeForm %>% unique()
data$PurposeForm %>% unique()
data$ResultForm %>% unique()
data$MannerFormLongOrdered %>% unique()
data$SpeedFormLong %>% unique()

# Descriptive statistics of the length in words of each semantic unit
# (describe() comes from psych; not presented in the paper)
data$SourceLength %>% describe()
data$LocationLength %>% describe()
data$TrajectoryLength %>% describe()
data$DirectionLength %>% describe()
data$GoalLength %>% describe()
data$DistanceLength %>% describe()
data$PurposeLength %>% describe()
data$ResultLength %>% describe()
data$TimeLength %>% describe()
data$MannerLength %>% describe()

# Table 4: The distribution of different types of movers in the data.
data$MoverAnimacy %>% summary()

# Table 5: The presence or absence of spatial expressions in clauses that contain Speed modifiers (Speed+) and in those that do not (Speed-)
# Work on a copy so that the original SlowOrFast coding in `data` stays intact.
dataSP <- data

# Collapse the modifier types into Speed+ / Speed- in a single pass.
# car::recode() is called explicitly: the string syntax below is car's, and an
# unqualified recode() would only work while car masks dplyr on the search path.
dataSP$SlowOrFast <- car::recode(
  dataSP$SlowOrFast,
  paste(
    "c('slowModif','fastModif','variableModif')='speedplus'",
    "'noSpeedModif'='speedminus'",
    sep = ";"
  )
)
dataSP$SlowOrFast <- factor(dataSP$SlowOrFast)
table(dataSP$SlowOrFast)

tab_xtab(
  dataSP$SlowOrFast, dataSP$SpatialExprPresence,
  show.row.prc = TRUE,
  show.summary = TRUE
)

# Chi-squared test without continuity correction.
chisq.test(dataSP$SlowOrFast, dataSP$SpatialExprPresence, correct = FALSE)
# cramersV() forwards its arguments to chisq.test(); `correct = FALSE` is passed
# so that the effect size is based on the same uncorrected statistic as the
# test reported above (the default would apply Yates' correction to a 2x2 table).
cramersV(dataSP$SlowOrFast, dataSP$SpatialExprPresence, correct = FALSE)

# Table 6: The distribution of speed modifiers across spatial expressions
data$SlowOrFast %>% summary()

# Drop clauses without a speed modifier and clauses with a variable one,
# then discard the factor levels that are no longer used.
dataS <- data %>%
  filter(SlowOrFast != "noSpeedModif", SlowOrFast != "variableModif")
dataS$SlowOrFast <- factor(dataS$SlowOrFast)
dataS$SlowOrFast %>% summary()

# Keep the modifier type plus the six spatial-expression columns.
pro2 <- dataS %>%
  select(SlowOrFast, Source, Location, Trajectory, Direction, Goal, Distance)
nrow(pro2)

# Long format: one row per clause and spatial-expression column.
propikk <- pro2 %>% melt(id.vars = "SlowOrFast")
head(propikk)
nrow(propikk)

# Keep only the rows where the spatial expression is marked "yes".
propik <- propikk %>% filter(value == "yes")
nrow(propik)
colnames(propik)

tab_xtab(
  propik$SlowOrFast, propik$variable,
  show.row.prc = TRUE,
  show.summary = TRUE
)
chisq.test(propik$SlowOrFast, propik$variable)
cramersV(propik$SlowOrFast, propik$variable) # 0.2539014


###############
### FIGURES ###
###############

# Figures 1 and 2: Multiple correspondence analysis (MCA).
# The same MCA solution is plotted twice, coloured once by VerbType (Figure 1)
# and once by VerbSpeedCat (Figure 2).
data2 <- select(
  data,
  Source, Location, Trajectory, Direction, Goal, Time, Purpose, MoverAnimacy,
  Verb, Manner, Result, Distance, SlowOrFast
)

# Number of observed categories per variable. vapply() iterates over the
# columns directly instead of coercing the data frame to a character matrix
# (as apply() would); droplevels() keeps the count restricted to categories
# that actually occur.
cats <- vapply(
  data2,
  function(column) nlevels(droplevels(as.factor(column))),
  integer(1)
)

mca1 <- MCA(data2, graph = FALSE)

# Category and individual coordinates as data frames. They are not used by the
# plots below.
mca1_vars_df <- data.frame(mca1$var$coord, Variable = rep(names(cats), cats))
mca1_obs_df <- data.frame(mca1$ind$coord)

# Contributions of the categories to dimension 1, dimension 2 and both.
fviz_contrib(mca1, choice = "var", axes = 1, top = 10)
fviz_contrib(mca1, choice = "var", axes = 2, top = 10)
fviz_contrib(mca1, choice = "var", axes = 1:2, top = 20)
ind <- get_mca_ind(mca1)

# Draw the MCA biplot with the individuals coloured by a grouping factor.
#   mca     : result of MCA()
#   groups  : factor with one value per individual, used for colouring
#   palette : one colour per level of `groups`
# Returns the ggplot object produced by fviz_mca_biplot().
plot_mca_biplot <- function(mca, groups, palette) {
  fviz_mca_biplot(
    mca,
    geom.var = "text", geom.ind = "point",
    repel = TRUE, col.var = "black",
    label = "all",
    invisible = "none",
    alpha.ind = 0.3,
    # Show every individual (12,300 clauses in the published data set)
    # without hard-coding the number of rows.
    select.ind = list(contrib = nrow(mca$ind$coord)),
    # Label only the 50 categories with the highest contribution.
    select.var = list(contrib = 50),
    habillage = groups,
    palette = palette,
    ggtheme = theme_minimal(),
    title = ""
  )
}

# Figure 1: Multiple correspondence analysis. Colours indicate VerbType.
plot_mca_biplot(mca1, data$VerbType, c("gold", "blue", "red"))

# Figure 2: Multiple correspondence analysis. Colours indicate VerbSpeed.
plot_mca_biplot(mca1, data$VerbSpeedCat, c("orange", "grey", "darkgreen"))

# Figure 3: Conditional inference tree for VerbSpeed
# Abbreviate the level labels of two predictors (the abbreviated labels are
# the ones shown in the tree plots). car::recode() is called explicitly: the
# string syntax below is car's, and an unqualified recode() would only work
# while car masks dplyr on the search path.
data$MoverAnimacy <- car::recode(
  data$MoverAnimacy,
  paste(
    "'animate'='an'",
    "'inanimate'='inan'",
    "'vehicle'='veh'",
    "'unclear'='un'",
    sep = ";"
  )
)
data$SlowOrFast <- car::recode(
  data$SlowOrFast,
  paste(
    "'slowModif'='slowM'",
    "'fastModif'='fastM'",
    "'noSpeedModif'='noM'",
    "'variableModif'='varM'",
    sep = ";"
  )
)

# Tree for the numeric VerbSpeed response, limited to depth 5 with at least
# 50 observations per terminal node.
ctree1 <- ctree(
  VerbSpeed ~ Source + Location + Trajectory + Direction + Goal + Distance +
    Result + Time + Purpose + MoverAnimacy + Manner + SlowOrFast,
  data = data,
  controls = ctree_control(maxdepth = 5, minbucket = 50)
)
plot(ctree1)

# Goodness of fit: correlation between the fitted and the observed VerbSpeed
# values (Pearson by default; not a classification accuracy).
predicted1 <- predict(ctree1)
head(predicted1)
actual1 <- data$VerbSpeed
head(actual1)
cor(predicted1, actual1)

# Figure 4: Conditional inference tree for VerbCat
# Abbreviate the verb-category labels in one pass, then restore the intended
# level order (recoding returns the levels in alphabetical order).
# car::recode() is called explicitly: the string syntax below is car's, and an
# unqualified recode() would only work while car masks dplyr on the search path.
data$VerbCat <- car::recode(
  data$VerbCat,
  paste(
    "'slowMV'='sMV'",
    "'fastMV'='fMV'",
    "'mediumMV'='mMV'",
    "'mediumGV'='mGV'",
    "'mediumSV'='mSV'",
    sep = ";"
  )
)
data$VerbCat <- factor(
  data$VerbCat,
  levels = c("mSV", "mGV", "sMV", "mMV", "fMV")
)

# Classification tree for VerbCat, limited to depth 3 with at least 50
# observations per terminal node.
ctree2 <- ctree(
  VerbCat ~ Source + Location + Trajectory + Direction + Goal + Purpose +
    Distance + Result + Time + Manner + SlowOrFast + MoverAnimacy,
  data = data,
  controls = ctree_control(maxdepth = 3, minbucket = 50)
)
plot(ctree2)

# Accuracy: share of clauses whose predicted category equals the observed one.
predicted2 <- predict(ctree2)
head(predicted2)
actual2 <- data$VerbCat
head(actual2)
# Confusion table (observed in rows, predicted in columns). It is stored under
# a descriptive name so that it does not shadow base::t(). The diagonal holds
# the correct predictions, assuming both factors share the same level order.
(confusion <- table(actual2, predicted2))
(accuracy <- sum(diag(confusion)) / sum(confusion))
# Baseline for comparison: relative frequency of each verb category.
prop.table(table(data$VerbCat))