Introduction

This is an R Markdown script showing how I analyzed the phonological distances between Eurasian lects based on Phonotacticon 1.0 for my thesis ``Phonological areas in Eurasia’’ (2024).

The data

First, I set up the environment of R Markdown.

options(scipen = 100, digits = 3)

Then I load the required R packages.

library(data.table)
library(tidytable)
library(stringr)
library(stringi)
library(geosphere)
library(plotly)
library(geodata)
library(plyr)
library(tibble)
library(forcats)
library(purrr)
library(Rfast)
library(e1071)
library(caret)
library(dplyr)
library(anocva)
library(profmem)
library(stargazer)
library(lme4)
library(viridis)
library(spdep)
library(spatialreg)
library(xtable)
library(ggforce)
library(magrittr)
library(vegan)

I load Phonotacticon.

Phonotacticon <- read.csv("Phonotacticon1_0.csv") %>% 
  as.data.table()

Phonotacticon

All the subset sample lects are listed below.

Eurasia <- Phonotacticon %>% 
  .[Onset != '?' & 
    Nucleus != '?' &
    Coda != '?' &
    !grepl('C{2,}', Onset) &
    !grepl('C{2,}', Coda) &
    !grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', Onset) &
    !grepl('\\[.{10}.*?\\]\\[.{10}.*?\\]', Coda)] %>% 
  .[, .(Lect, Phoneme, Tone, Onset, Nucleus, Coda)]

Eurasia$Lect

##   [1] "A'ou"                              "Akajeru"                          
##   [3] "Amdo Tibetan"                      "Angami Naga"                      
##   [5] "Ao Naga"                           "Archi"                            
##   [7] "Aromanian"                         "Arpitan"                          
##   [9] "Arvanitika Albanian"               "Asho Chin"                        
##  [11] "Assamese"                          "Asturian-Leonese-Cantabrian"      
##  [13] "Atong (India)"                     "Avar"                             
##  [15] "Baba Malay"                        "Badaga"                           
##  [17] "Bagvalal"                          "Bantawa"                          
##  [19] "Basque"                            "Betta Kurumba"                    
##  [21] "Bezhta"                            "Bih"                              
##  [23] "Bisu"                              "Biyo"                             
##  [25] "Bodo-Mech"                         "Bolyu"                            
##  [27] "Bonan"                             "Budukh"                           
##  [29] "Bugan"                             "Bujhyal"                          
##  [31] "Bulo Stieng"                       "Bunan"                            
##  [33] "Burmese"                           "Burushaski"                       
##  [35] "Cao Miao"                          "Catalan"                          
##  [37] "Central Bai"                       "Central Chong"                    
##  [39] "Central Hongshuihe Zhuang"         "Central Khmer"                    
##  [41] "Chak"                              "Chintang"                         
##  [43] "Chitwania Tharu"                   "Chong of Chanthaburi"             
##  [45] "Chothe"                            "Chukchi"                          
##  [47] "Chut"                              "Chuvash"                          
##  [49] "Cosao"                             "Cypriot Arabic"                   
##  [51] "Daai Chin"                         "Dagur"                            
##  [53] "Daman-Diu Portuguese"              "Dandami Maria"                    
##  [55] "Danish"                            "Daohua"                           
##  [57] "Dari"                              "Darma"                            
##  [59] "Deori"                             "Dhimal"                           
##  [61] "Dhivehi"                           "Domari"                           
##  [63] "Dongxiang"                         "Duhumbi"                          
##  [65] "Dumi"                              "Dungan"                           
##  [67] "Duoluo Gelao"                      "Dutch"                            
##  [69] "Dzongkha"                          "E"                                
##  [71] "Eastern Katu"                      "Eastern Kayah"                    
##  [73] "Eastern Magar"                     "Eastern Newari"                   
##  [75] "Eastern Panjabi"                   "Eastern Tamang"                   
##  [77] "Enu"                               "Ersu"                             
##  [79] "Estonian Swedish"                  "Evenki"                           
##  [81] "Forest Enets"                      "French"                           
##  [83] "Friulian"                          "Galo"                             
##  [85] "Gan Chinese"                       "Gata'"                            
##  [87] "Georgian"                          "German"                           
##  [89] "Gheg Albanian"                     "Gilaki"                           
##  [91] "Godoberi"                          "Godwari"                          
##  [93] "Gujarati"                          "Gurani"                           
##  [95] "Hakka Chinese"                     "Halbi"                            
##  [97] "Halh Mongolian"                    "Hills Karbi"                      
##  [99] "Hindi"                             "Hinuq"                            
## [101] "Hmong Njua"                        "Hokkaido Ainu"                    
## [103] "Honi"                              "Hui Chinese"                      
## [105] "Hungarian"                         "Icelandic"                        
## [107] "Ingrian"                           "Irula of the Nilgiri"             
## [109] "Italian"                           "Iu Mien"                          
## [111] "Japanese"                          "Japhug"                           
## [113] "Jarawa (India)"                    "Jejueo"                           
## [115] "Jennu Kurumba"                     "Jerung"                           
## [117] "Jinyu Chinese"                     "Jiongnai Bunu"                    
## [119] "Kabardian"                         "Kadar"                            
## [121] "Kado"                              "Kaduo"                            
## [123] "Kashmiri"                          "Kathmandu Valley Newari"          
## [125] "Katso"                             "Kayan Lahwi"                      
## [127] "Kazakh"                            "Kelantan-Pattani Malay"           
## [129] "Ket"                               "Khams Tibetan"                    
## [131] "Khasi"                             "Khezha Naga"                      
## [133] "Khinalug"                          "Khmu"                             
## [135] "Kirghiz"                           "Kirmanjki"                        
## [137] "Kman"                              "Kodava"                           
## [139] "Koi"                               "Koireng"                          
## [141] "Komi-Zyrian"                       "Konda-Dora"                       
## [143] "Konkan Marathi"                    "Korean"                           
## [145] "Korku"                             "Korra Koraga"                     
## [147] "Koryak"                            "Kotia-Adivasi Oriya-Desiya"       
## [149] "Kucong"                            "Kui (India)"                      
## [151] "Kumaoni"                           "Kumarbhag Paharia"                
## [153] "Kumyk"                             "Kurtokha"                         
## [155] "Kuy"                               "Kyerung"                          
## [157] "Lachi"                             "Ladino"                           
## [159] "Lahu"                              "Lak"                              
## [161] "Lakkia"                            "Lambadi"                          
## [163] "Lamjung-Melamchi Yolmo"            "Lao"                              
## [165] "Lashi"                             "Laven"                            
## [167] "Laz"                               "Leh Ladakhi"                      
## [169] "Lepcha"                            "Lhomi"                            
## [171] "Liangmai Naga"                     "Limbu"                            
## [173] "Lisu"                              "Longchuan Achang"                 
## [175] "Macedonian"                        "Maithili"                         
## [177] "Malacca-Batavia Portuguese Creole" "Malavedan"                        
## [179] "Malayalam"                         "Manchu"                           
## [181] "Mandarin Chinese"                  "Mang"                             
## [183] "Mangghuer"                         "Manipuri"                         
## [185] "Mao Naga"                          "Maonan"                           
## [187] "Maram Naga"                        "Marathi"                          
## [189] "Marwari (India)"                   "Mewati"                           
## [191] "Milang"                            "Min Bei Chinese"                  
## [193] "Min Nan Chinese"                   "Miyako"                           
## [195] "Mlabri"                            "Modern Greek"                     
## [197] "Moken"                             "Mon"                              
## [199] "Mongghul"                          "Moyon"                            
## [201] "Muduga"                            "Mulam"                            
## [203] "Mundari"                           "Nanai"                            
## [205] "Narua"                             "Naukan Yupik"                     
## [207] "Negidal"                           "Neo-Mandaic"                      
## [209] "Nepali"                            "Nganasan"                         
## [211] "Nihali"                            "Nimadi"                           
## [213] "Nocte Naga"                        "North Azerbaijani"                
## [215] "North-Central Dargwa"              "Northeastern Thai"                
## [217] "Northern Jinghpaw"                 "Northern Pashto"                  
## [219] "Northern Pinghua"                  "Northern Pumi"                    
## [221] "Northern Thai"                     "Northern Yukaghir"                
## [223] "Northwestern Kolami"               "Nung (Myanmar)"                   
## [225] "Nuristani Kalasha"                 "Nyahkur"                          
## [227] "Odia"                              "Oki-No-Erabu"                     
## [229] "Oroch"                             "Ostfränkisch"                     
## [231] "Pa-Hng"                            "Pacoh"                            
## [233] "Paite Chin"                        "Pela"                             
## [235] "Peripheral Mongolian"              "Phom Naga"                        
## [237] "Piemontese"                        "Pite Saami"                       
## [239] "Pnar"                              "Pontic"                           
## [241] "Portuguese"                        "Pu-Xian Chinese"                  
## [243] "Purik-Sham-Nubra"                  "Pwo Eastern Karen"                
## [245] "Rabha"                             "Rajbanshi"                        
## [247] "Ravula"                            "Russia Buriat"                    
## [249] "Russian"                           "Rutul"                            
## [251] "Sadri"                             "Sadu"                             
## [253] "Sakha"                             "Sangkong"                         
## [255] "Sani"                              "Santali"                          
## [257] "Saurashtra"                        "Sedang"                           
## [259] "Selkup"                            "Semelai"                          
## [261] "Shixing"                           "Sholaga"                          
## [263] "Sichuan Yi"                        "Sikkimese"                        
## [265] "Sindhi"                            "Sinhala"                          
## [267] "Situ"                              "Solu-Khumbu Sherpa"               
## [269] "Sora"                              "South Azerbaijani"                
## [271] "South Wa"                          "Southeast Pashayi"                
## [273] "Southern Altai"                    "Southern Amami-Oshima"            
## [275] "Southern Jinghpaw"                 "Southern Pashto"                  
## [277] "Southern Pumi"                     "Southern Qiang"                   
## [279] "Southern Rengma Naga"              "Southern Yukaghir"                
## [281] "Southwestern Dargwa"               "Sri Lanka Malay"                  
## [283] "Standard Malay"                    "Stau-Dgebshes"                    
## [285] "Sui"                               "Sunwar"                           
## [287] "Tai Do-Mene-Yo"                    "Tamil"                            
## [289] "Tangam"                            "Tatar"                            
## [291] "Thado Chin"                        "Thai"                             
## [293] "Thakali"                           "Thangmi"                          
## [295] "Thulung"                           "Tibetan"                          
## [297] "Toda"                              "Tsat"                             
## [299] "Tsez"                              "Tshangla"                         
## [301] "Tulu"                              "Tundra Nenets"                    
## [303] "Tuvinian"                          "Udihe"                            
## [305] "Uighur"                            "Vaagri Booli"                     
## [307] "Vach-Vasjugan"                     "Varhadi-Nagpuri"                  
## [309] "Vietnamese"                        "Waddar"                           
## [311] "Wambule"                           "Wayu"                             
## [313] "Welsh"                             "West Yugur"                       
## [315] "Western Armenian"                  "Western Magar"                    
## [317] "Western Muya"                      "Western Ong-Be"                   
## [319] "Western Parbate Kham"              "Western Puroik"                   
## [321] "Western Tamang"                    "Western Xiangxi Miao"             
## [323] "Westphalic"                        "Wu Chinese"                       
## [325] "Wuding-Luquan Yi"                  "Wutunhua"                         
## [327] "Yakkha"                            "Yerong-Southern Buyang"           
## [329] "Yongbei Zhuang"                    "Youle Jinuo"                      
## [331] "Yue Chinese"                       "Zaiwa"                            
## [333] "Zauzou"                            "Zbu"                              
## [335] "Zeme Naga"

I make a list of lects and their geographical coordinates.

Lect_LonLat <- Phonotacticon %>%
  .[Lect %in% Eurasia$Lect] %>%
  .[, .(Lect, lon, lat)]
  
Lect_LonLat

For visualizations, I prepare a map of Eurasia. First, I load map data.

map <- map_data("world")

head(map)

Then I create a map of Eurasia.

EurasiaMap <- ggplot(map, aes(x = long, y = lat)) +
  geom_polygon(aes(group = group),
               fill = "white",
               color = "darkgrey",
               size = 0.2) +
  coord_map("ortho",
            orientation = c(20, 70, 0),
            xlim = c(10, 130),
            ylim = c(0, 90)) +
  theme_void()

EurasiaMap

Below shows the first ten segments and the first ten featural values of a modified version of PanPhon.

PanPhon <- fread("PanPhonPhonotacticon1_0.csv") %>%
  as.data.table() %>%
  unique(by = 'ipa')

PanPhon

Check if all phonemic transcriptions are present in PanPhon:

Transcriptions <- Eurasia$Phoneme %>% 
  str_split_fixed(pattern = ' ', n = Inf) %>% 
  as.data.table() %>% 
  melt(measure.vars = colnames(.)) %>% 
  select(-variable) %>% 
  filter(value != '') %>% 
  distinct() %>% 
  mutate(Correct = value %in% PanPhon$ipa)

all(Transcriptions$Correct)

## [1] TRUE

Define a function making a booktabs code:

booktabs <- function(x, y) {
  addtorow <- list()
  addtorow$pos <- list(-1, 0, nrow(x))
  addtorow$command <- c('\\toprule ', '\\midrule ', '\\bottomrule ')
  print(x, 
        file = y, 
        include.rownames = FALSE, 
        add.to.row = addtorow,
        hline.after = NULL)
}

Analyzing the sequences of each lect

In this section, I will analyze the sequences of each lect.

I arrange PanPhon segments in alphabetical order.

PanPhonOrder <- PanPhon$ipa[
                  order(-nchar(PanPhon$ipa),
                      PanPhon$ipa)]

head(PanPhonOrder, 10)

##  [1] "h͡d̪͡ɮ̪ʲʷ⁺" "h͡d̪͡ɮ̪ʷː⁺" "h͡d̪͡ɮ̪ʷˠ⁺" "h͡d̪͡ɮ̪ʷˤ⁺" "h͡d̪͡z̪ʲʷ⁺" "h͡d̪͡z̪ʷː⁺" "h͡d̪͡z̪ʷˠ⁺" "h͡d̪͡z̪ʷˤ⁺"
##  [9] "h͡t̪͡ɬ̪ʲʷ⁺" "h͡t̪͡ɬ̪ʷː⁺"

I create a regex line of PanPhon in order to split the segments from sequences.

PanPhonRegex <- paste0("(?:",
                       paste(PanPhonOrder, collapse="|"),
                      '|B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z',
                       ")")

str_trunc(PanPhonRegex, 100)

## [1] "(?:h͡d̪͡ɮ̪ʲʷ⁺|h͡d̪͡ɮ̪ʷː⁺|h͡d̪͡ɮ̪ʷˠ⁺|h͡d̪͡ɮ̪ʷˤ⁺|h͡d̪͡z̪ʲʷ⁺|h͡d̪͡z̪ʷː⁺|h͡d̪͡z̪ʷˠ⁺|h͡d̪͡z̪ʷˤ⁺|h͡t̪͡ɬ..."

I also create PanPhon regex including brackets, in order to detect segments within brackets (e. g. [ptk] meaning “/p/, /t/, or /k/”.)

PanPhonRegexBrackets <- paste0('(?:',
                               '(?<=\\[).*?(?=\\])|',
                               paste(PanPhonOrder, collapse="|"),
                               '|B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z',
                               ')')

str_trunc(PanPhonRegexBrackets, 100)

## [1] "(?:(?<=\\[).*?(?=\\])|h͡d̪͡ɮ̪ʲʷ⁺|h͡d̪͡ɮ̪ʷː⁺|h͡d̪͡ɮ̪ʷˠ⁺|h͡d̪͡ɮ̪ʷˤ⁺|h͡d̪͡z̪ʲʷ⁺|h͡d̪͡z̪ʷː⁺|h͡d̪͡z̪ʷˠ⁺|..."

I define “classes”, i. e. underspecified segments transcribed in capitals (e. g. P for plosives).

Classes <- PanPhon %>%
  mutate(B = cons == 1 & lab == 1,
        C = cons == 1,
        Č = cons == 1 & delrel == 1 & son == -1 & cont == -1,
        `F` = cons == 1 & cont == 1 & son == -1,
        G = grepl('j|w|ɥ|ɰ', ipa),
        Ł = cons == 1 & cor == 1 & lat == 1,
        L = cons == 1 & cont == 1 & cor == 1 & son == 1,
        N = nas == 1 & syl == -1,
        P = cons == 1 & cont == -1 & delrel == -1 & son == -1,
        R = cont == 1 & son == 1 & syl == -1 & !grepl('h|ɦ', ipa),
        S = cons == 1 & cont == 1 & cor == 1 & son == -1,
        `T` = cons == 1 & son != 1,
        V = cons == -1 & cont == 1 & son == 1 & syl == 1,
        W = syl == -1 & voi == 1,
        X = syl == -1 & voi == -1,
        Z = cont == 1 & syl == -1) %>%
  select(ipa, B, C, Č, `F`, G, Ł, L, N, P, R, S, `T`, V, W, X, Z) %>%
  pivot_longer(cols = -ipa,
     names_to = 'Class',
     values_to = 'Value') %>%
  filter(Value) %>%
  select(-Value)

Classes

I extract phonemes from the phonemic inventories.

Phonemes <- stri_extract_all_regex(Eurasia$Phoneme,
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>%
  as.data.table() %>%
  mutate(Lect = Eurasia$Lect) %>%
  melt(id.vars = 'Lect',
       variable.name = 'Number',
       value.name = 'ipa') %>%
  select(-Number) %>%
  filter(ipa != '')

Phonemes

I subset lect, onsets, nuclei, and codas from Phonotacticon.

LectONC <- Eurasia %>%
  .[, .(Lect, Onset, Nucleus, Coda)] %>%
  melt(id.vars = 'Lect',
       variable.name = 'Category',
       value.name = 'Sequence')

LectONC

I extract the sequences from onset, nucleus, and coda categories.

Sequences <- LectONC[, tstrsplit(Sequence, ' ', fixed = FALSE)] %>%
  .[, c('Lect', 'Category') := .(LectONC$Lect, LectONC$Category)] %>% 
  melt(id.vars = c('Lect', 'Category'),
       variable.name = 'Number',
       value.name = 'Sequence') %>%
  .[, -c('Number')] %>%
  .[!is.na(Sequence)] %>%
  distinct()

Sequences

I subset sequences that include underspecified segments (transcribed in capital letters).

Capitals <-
  Sequences %>%
  .[grepl('B|C|Č|F|G|Ł|L|N|O|P|R|S|T|V|W|X|Z', Sequence)] %>%
  .[, -c('Category')] %>%
  distinct()

Capitals

I convert the capital letters into the corresponding phonemes in each lect. For example, P (“plosive”) in Italian is converted to all the plosive phonemes in Italian phonemic inventory.

Decapitalized <-
  stri_extract_all_regex(Capitals$Sequence,
                         pattern = PanPhonRegex,
                         simplify = TRUE) %>%
  as.data.table() %>%
  .[, c('Lect', 'Sequence') := 
      .(Capitals$Lect, Sequence = Capitals$Sequence)] %>%
  melt(id.vars = c('Lect', 'Sequence'),
        variable.name = 'Order',
       value.name = 'Class') %>%
  .[, Order := as.integer(as.factor(Order))] %>% 
  .[Class != ''] %>%
  merge(Classes, all = TRUE, allow.cartesian = TRUE) %>%
  .[, ipa := if_else(is.na(ipa), Class, ipa)] %>%
  .[, -c('Class')] %>%
  merge(Phonemes) %>%
  setorder(col = Order) %>%
  split(by = c('Lect', 'Sequence')) %>%
  lapply(function(x)
           split(x, by = 'Order')) %>%
  lapply(function(x)
    lapply(x, function(x)
      x <- x$ipa)) %>%
  lapply(function(x)
    expand.grid(x) %>%
      do.call(what = paste0)) %>%
  enframe() %>%
  unnest() %>%
  as.data.table() %>%
  separate(col = name,
           into = c('Lect', 'Sequence'),
           sep = '\\.') %>%
  setnames('value', 'NewSequence') %>%
  merge(Sequences, all = TRUE) %>%
  mutate(Sequence =
           if_else(!is.na(NewSequence),
                   NewSequence,
                   Sequence)) %>%
  .[, -c('NewSequence')]

Decapitalized

I split the sequences into segments, including bracketed segments (such as [ptk] for “/p/, /t/, or /k/.)

ToUnbracket <- stri_extract_all_regex(Decapitalized$Sequence,
                         pattern = PanPhonRegexBrackets,
                         simplify = TRUE) %>%
  as.data.table() %>%
  mutate(Lect = Decapitalized$Lect,
         Category = Decapitalized$Category,
         Sequence = Decapitalized$Sequence) %>%
  melt(id = c('Lect', 'Category', 'Sequence'),
       variable.name = 'Order',
       value.name = 'ipa') %>%
  mutate(Order = Order %>%
           as.factor() %>%
           as.integer()) %>%
  filter(ipa != "")

ToUnbracket

I subset bracketed sequences.

Bracketed <- ToUnbracket %>%
  filter(grepl('\\[', Sequence))

Bracketed

I convert the bracketed sequences into all logically possible sequences. For example, Laven’s sequence [bdɟɡ] [rl] is converted into /br/, /bl/, /dr/, /dl/, /ɟr/ /ɟl/, /ɡr/, and /ɡl/.

Unbracketed <- Bracketed$ipa %>%
  stri_extract_all_regex(pattern = PanPhonRegex, simplify = TRUE) %>%
  as.data.table() %>%
  mutate(Sequence = Bracketed$Sequence,
         Order = Bracketed$Order) %>%
  melt(id.vars = c('Sequence', 'Order'),
       variable.name = 'Number',
       value.name = 'ipa') %>%
  filter(ipa != '') %>%
  select(-Number) %>%
  setorder(col = Order) %>%
  split(by = 'Sequence') %>%
  lapply(function(x)
    split(x, by = 'Order')) %>%
  lapply(function(x)
    lapply(x, function(x)
      x <- x$ipa)) %>%
  lapply(function(x)
    expand.grid(x) %>%
      do.call(what = paste0)) %>%
  enframe() %>%
  unnest() %>%
  setnames(c('name', 'value'),
           c('Sequence', 'NewSequence')) %>% 
  as.data.table()

Unbracketed

I join the unbracketed sequences into the whole list of sequences. Then I split the sequences into segments (e. g. /pl/ into /p/ and /l/).

Segments <-
  stri_extract_all_regex(
  Unbracketed$NewSequence,
  pattern = PanPhonRegex,
  simplify = TRUE) %>%
  as.data.table() %>%
  mutate(Sequence = Unbracketed$Sequence,
         NewSequence = Unbracketed$NewSequence) %>%
    pivot_longer(cols = -c(Sequence, NewSequence),
               names_to = 'Order',
               values_to = 'NewIPA') %>%
    mutate(Order = Order %>%
           as.factor() %>%
           as.integer()) %>%
  filter(NewIPA != '') %>%
  full_join(ToUnbracket) %>%
  mutate(Sequence =
           if_else(
             !is.na(NewSequence),
             NewSequence,
             Sequence),
         ipa =
           if_else(
             !is.na(NewIPA),
             NewIPA,
             ipa)) %>%
  select(-NewSequence, -NewIPA) %>% 
  as.data.table()

## Joining with `by = join_by(Sequence, Order)`

Segments

Measuring the length of sequences

In this section, I will measure the length of each sequence, where length is the number of segments that consist a sequence.

First, I measure the length of each sequence, in terms of the number of segments involved.

Sequences_length <- Segments %>% 
  .[, .(Length = max(Order)), by = .(Lect, Category, Sequence)]

Sequences_length

I join the length of each sequence to segments.

Segments <- left_join(Segments, Sequences_length)

## Joining with `by = join_by(Sequence, Lect, Category)`

Segments

Measuring the distance between sequences

In this section, I will show how I measure the distance between two sequences, e. g. between /pl/ and /spl/.

First, I count the maximal length of all sequences.

MaxLength <- max(Sequences_length$Length)

MaxLength

## [1] 6

I count the number of all the split segments.

Segments_number <- nrow(Segments)

Segments_number

## [1] 81807

In order to measure the distance between two sequences of different length. I assign different “positions” to each sequence. As the maximal length of all sequences is six, a sequence of only one segment has six positions within these six slots (from 0 to 5).

Sequences_rep <- bind_rows(rep(list(Segments), MaxLength))  %>%
  mutate(Position = rep(0:(MaxLength - 1),
                        each = Segments_number)) %>%
  mutate(Order = Order + Position) %>%
  filter(Length + Position <= MaxLength) %>%
  select(-Length)

Sequences_rep

I join segments with their phonological features (retrieved from PanPhon). Each feature is assigned the value of the position.

Sequences_features <- Sequences_rep %>%
  left_join(PanPhon, by = 'ipa') %>%
               melt(id = c('Lect',
                         'Category',
                         'Sequence',
                         'Order',
                         'ipa',
                         'Position'),
               variable.name = 'Feature',
               value.name = 'Value') %>%
  mutate(Feature = paste0(Feature, Order)) %>%
  dcast(Lect + Category + Sequence + Position ~ Feature,
        value.var = 'Value',
        fun.aggregate = sum,
        fill = 0) %>%
  mutate(SequencePosition = paste0(Sequence, Position)) %>%
  select(-Lect, -Category, -Position, -Sequence) %>%
  distinct()

Sequences_features

I then calculate the Saporta distance between each pair of sequences.

Sequences_distance <- Sequences_features %>%
  select(-SequencePosition) %>%
  Dist(method = 'manhattan') %>%
  as.data.table()

Sequences_distance[1:10, 1:10]

In order to name the rows and the columns of the distance matrix, I create a vector of all sequences in different positions.

SequenceVectors <-
  str_replace(Sequences_features$SequencePosition, '[0-9]', '')

head(SequenceVectors, 10)

##  [1] "bl" "bl" "bl" "bl" "bl" "d"  "d"  "d"  "d"  "d"

I then use this vector to name the rows and the columns of the distance matrix.

Sequences_distance[, Sequence := SequenceVectors]

setcolorder(Sequences_distance, 
            c(ncol(Sequences_distance), 1:(ncol(Sequences_distance) - 1)))

setnames(Sequences_distance, c('Sequence', Sequences_features$SequencePosition))

Sequences_distance[1:10, 1:10]

As I have shown above, I need to choose the minimal distance between two sequences mapped onto each other in different positions. Thus I calculate the minimal distance per each sequence pair.

First, I make a vector of all sequences.

AllSequences <- unique(Segments$Sequence)

head(AllSequences)

## [1] "mp" "np" "wp" "jp" "lp" "rp"

I calculate the minimal distance of every sequence to each sequence.

for (i in AllSequences){
  
  csv_name <- paste0('~/Phonotacticon/Sequences/', i, '.csv')
  
  OneSequence <-
    Sequences_distance[Sequence == i] %>% 
    melt(id.vars = 'Sequence',
         variable.name = 'i.Sequence',
         value.name = 'Distance') %>% 
    .[, i.Sequence := gsub('[0-9]', '', i.Sequence)] %>% 
    .[, .(Distance = min(Distance)),
     by = .(i.Sequence)] %>% 
    .[, Sequence := i]
  
  fwrite(OneSequence, csv_name)
}

I make a list of the sequence file names.

Sequences_file_list <- 
  list.files(path = '~/Phonotacticon/Sequences', 
             pattern = '.*')

head(Sequences_file_list)

## [1] "#.csv" "a.csv" "ă.csv" "ã.csv" "ã̰.csv" "a̰.csv"

I read them into a data table.

Sequences_MinDistance <- 
  map_df(Sequences_file_list, ~ read.csv(paste0('~/Phonotacticon/Sequences/', .x))) %>% 
  as.data.table()

Sequences_MinDistance

Sequences that are the most similar to /pl/.

pl <- Sequences_MinDistance %>%
  filter(Sequence == 'pl') %>%
  arrange(Distance)

pl %>% 
  .[1:20] %>% 
  .[, Sequence := NULL] %>% 
  setnames(old = 'i.Sequence', new = 'Sequence') %>% 
  xtable(type = 'latex',
         label = 'pl',
         caption = 'Sequences the most similar to /pl/') %>% 
  booktabs('pl.tex')

pl

As another example, below are shown the sequences that are the most similar to /ia/.

ia <- Sequences_MinDistance %>%
  filter(Sequence == 'ia') %>%
  arrange(Distance)

ia %>% 
  .[1:20] %>% 
  .[, Sequence := NULL] %>% 
  setnames(old = 'i.Sequence', new = 'Sequence') %>% 
  xtable(type = 'latex',
         label = 'ia',
         caption = 'Sequences the most similar to /ia/') %>% 
  booktabs('ia.tex')

ia

Measuring the distance between lects

In this section, I will show how I measure the phonological distance between two lects.

First, I redefine sequences with new (decapitalized and unbracketed) sequences.

NewSequences <- Segments %>%
  select(Lect, Category, Sequence)

NewSequences

I split the data.table of sequences into separate lects.

for (i in Eurasia$Lect){
  csv_name <- paste0('~/Phonotacticon/Lects/', i, '.csv')
  
  Lect <- NewSequences[Lect == i]
  
  fwrite(Lect, csv_name)
}

I assign every onset/nucleus/coda sequence to every other onset/nucleus/coda sequence.

for (i in Eurasia$Lect){
  csv_name <- paste0('~/Phonotacticon/Lects/', i, '.csv')
  
  Lect <- read.csv(csv_name) %>% 
    as.data.table() %>% 
    .[Sequences_MinDistance, on = .(Sequence), nomatch = 0]
  
  fwrite(Lect, csv_name)
}

I calculate the minimal distance between the onset/nucleus/coda inventory of each pair of lects.

for (i in Eurasia$Lect){

  csv_name <- paste0('~/Phonotacticon/Lects/', i, '.csv')

  Lect <- 
    read.csv(csv_name) %>% 
    as.data.table() %>% 
    .[NewSequences,
      on = .(Category, 
             i.Sequence = Sequence),
      allow.cartesian = TRUE] %>% 
    .[, .(Distance = min(Distance)),
      by = .(i.Lect, Sequence, Category)] %>% 
    .[, .(Distance = mean(Distance)),
         by = .(i.Lect, Category)]
  
  Lect$Lect <- i

  fwrite(Lect, csv_name)
  
}

I make a list of .csv file names.

Lects_file_list <- 
  list.files(path = '~/Phonotacticon/Lects', 
             pattern = '.*')

head(Lects_file_list)

## [1] "A'ou.csv"         "Akajeru.csv"      "Amdo Tibetan.csv" "Angami Naga.csv" 
## [5] "Ao Naga.csv"      "Archi.csv"

I read the .csv files into a data.table.

ONC_distance_mean <- 
  map_df(Lects_file_list, ~ read.csv(paste0('~/Phonotacticon/Lects/', .x))) %>% 
  as.data.table()

ONC_distance_mean

In order to create a data.table of lect pairs, I first make a dummy column.

LectDummy <- Lect_LonLat %>% 
  .[, Dummy := 'Dummy']

LectDummy

I create a data.table of lect pairs.

Lect_vs_Lect <- LectDummy %>% 
  .[LectDummy, on = 'Dummy', allow.cartesian = TRUE] %>% 
  .[, Lect_vs_Lect := str_c(pmin(as.character(Lect), as.character(i.Lect)),
                                'vs.',
                                pmax(as.character(Lect), as.character(i.Lect)),
                               sep = ' ')] %>% 
  .[, -c('Dummy')]

Lect_vs_Lect

I assign the lect pair column to the mean distances. I then detect the maximal distance between the Lect A vs. Lect B pair and the Lect B vs. Lect A pair. The result is the onset/nucleus/coda distance between each pair of lects.

ONC_distance <-
  ONC_distance_mean %>% 
  .[Lect_vs_Lect, on = .(Lect, i.Lect)] %>% 
  .[, .(Lect_vs_Lect, Category, Distance)] %>% 
  .[, .(Distance = max(Distance)),
      by = .(Lect_vs_Lect, Category)] %>% 
    dcast(., Lect_vs_Lect ~ Category, value.var = 'Distance') %>% 
  relocate(Lect_vs_Lect, Onset, Nucleus, Coda)

ONC_distance %>% 
  sample_n(5) %>% 
  xtable(type = 'latex',
         label = 'ONCsample',
         caption = 'Five random lect pairs and their onset, nucleus, and coda distances') %>% 
  booktabs('ONC.tex')

ONC_distance

Measuring the distance of tones

Next, I calculate the distance between the tonality of each pair of lects.

First, I count the number of tones in each lect. Below shows the first ten lects.

Tones <- Eurasia %>%
  .[, .(Lect, Tone)] %>%
  .[, Tone := gsub("\\-", NA, Tone)] %>%
  .[, Tone := str_count(Tone, " ") + 1]

Tones[is.na(Tones$Tone),]$Tone <- 0

Tones

I calculate the Canberra distance between the numbers of tones of each pair of lects. Below shows the first ten pairs.

Tones_distance <- Tones %>% 
  .[, -c('Lect')] %>% 
  dist(method = 'canberra') %>%
  as.matrix() %>%
  as.data.table() %>%
  setnames(Tones$Lect) %>%
  .[, Lect := Tones$Lect] %>%
  melt(id = 'Lect',
       variable.name = 'Lect2',
       value.name = 'Tone') %>%
  .[, Tone := replace_na(Tone, 0)] %>%
  .[, Lect_vs_Lect := str_c(pmin(as.character(Lect), as.character(Lect2)),
                                'vs.',
                                pmax(as.character(Lect), as.character(Lect2)),
                               sep = ' ')] %>%
  .[, .(Lect_vs_Lect, Tone)] %>%
  distinct()

Tones_distance

I join segmental distance with tonal distance and normalize the four distances. Below shows the first ten rows.

ONCT_distance <- ONC_distance %>%
  full_join(Tones_distance) %>% 
  select(-Lect_vs_Lect) %>% 
  scale() %>% 
  as.data.table() %>% 
  mutate(Lect_vs_Lect = ONC_distance$Lect_vs_Lect) %>% 
  relocate(Lect_vs_Lect)

## Joining with `by = join_by(Lect_vs_Lect)`

ONCT_distance

Measuring the overall distance

I then calculate the overall distance, which is the Euclidean distance between each pair of lects based on their four normalized distances (onset, nucleus, coda, and tone). Below shows the first ten rows.

PhonoDist <- ONCT_distance %>%
  mutate(Distance = sqrt((Onset - min(Onset)) ^ 2 +
                         (Nucleus - min(Nucleus)) ^ 2 +
                         (Coda - min(Coda)) ^ 2 +
                         (Tone - min(Tone)) ^ 2)) %>%
  select(Lect_vs_Lect, Distance)

PhonoDist %>% 
  arrange(Distance) %>% 
  filter(Distance > 0) %>% 
  head(10) %>% 
  xtable(caption = 'The ten lect pairs with the shortest phonological distance',
         label = 'FirstTenOverall',
         type = 'latex') %>% 
  booktabs('PhonoDist.tex')

fwrite(PhonoDist, 'PhonoDist.csv')

PhonoDist

Ten selected lects:

TenLects <- c('Basque',
              'Evenki',
              'Georgian',
              'Hindi',
              'Japanese',
              'Kazakh',
              'Mandarin Chinese',
              'Sri Lanka Malay',
              'Standard Malay',
              'Tsat')

TenLects

##  [1] "Basque"           "Evenki"           "Georgian"         "Hindi"           
##  [5] "Japanese"         "Kazakh"           "Mandarin Chinese" "Sri Lanka Malay" 
##  [9] "Standard Malay"   "Tsat"

The distances of other lects to each of the ten selected lects:

TenLectsDist <- PhonoDist %>% 
  .[Lect_vs_Lect, on = .(Lect_vs_Lect)] %>% 
  filter(Lect %in% TenLects,
         Lect != i.Lect) %>% 
  .[Phonotacticon, on = .(i.Lect = Lect), nomatch = 0] %>% 
  select(Family, Lect, i.Lect, Distance) %>% 
  arrange(Distance)
  
TenLectsDist

Write each of the ten lects and their twenty closest lects as .tex files:

for (i in TenLects){
  TwentyLects <- 
    TenLectsDist[Lect == i] %>% 
    .[, Lect := NULL] %>% 
    setnames(., 'i.Lect', 'Lect') %>% 
    head(20)
  
  Caption <- paste0('Twenty lects closest to ', i)
  Tex <- paste0(Caption, '.tex')
  
  TwentyLects %>% 
    xtable(label = i, caption = Caption, style = 'latex') %>% 
    booktabs(Tex)
}

Clustering the lects

Based on these distances, I perform two analyses: Multidimensional scaling and k-means clustering, in order to cluster similar lects together and detect areal patterns.

I conduct multidimensional scaling based on the phonological distances, the number of dimensions being maximal, i.e. the number of sample lects minus one. Below shows the first ten pairs of lects and the first five dimensions.

PhonoScale <- PhonoDist %>%
  left_join(Lect_vs_Lect, by = 'Lect_vs_Lect') %>%
  select(Lect, i.Lect, Distance) %>%
  dcast(Lect ~ i.Lect, value.var = 'Distance') %>%
  column_to_rownames('Lect') %>%
  t() %>%
  as.dist() %>%
  cmdscale(k = nrow(.) - 1) %>%
  as.data.frame() %>%
  rownames_to_column('Lect') %>%
  as.data.table()

## Warning in cmdscale(., k = nrow(.) - 1): only 157 of the first 334 eigenvalues
## are > 0

PhonoScale

Write the first five dimensions of the first five lects into a .tex file:

PhonoScale[1:5, 1:6] %>% 
  setNames(c('Lect', as.character(1:5))) %>% 
  xtable(label = 'FiveDimensions', 
       caption = 'First ten lects and their first five dimensions based on multidimensional scaling',
       style = 'LaTeX') %>% 
  booktabs('FiveDimensions.tex')

Next, based on the multidimensional scaling, I will perform k-means clustering of two, five, and ten clusters, in order to see if the clusters formed based on phonological distance consist of areally close lects.

First, I remove the lect column from PhonoScale.

PhonoScaleNumbers <-
  PhonoScale %>%
  .[, !c('Lect')]

PhonoScaleNumbers

I create a data table for clusters.

PhonoClusters <-
  PhonoScale %>%
  .[, c('Lect')] %>% 
  .[Lect_LonLat, on = .(Lect), nomatch = 0]

PhonoClusters

I cluster the lects into two, five, and ten groups.

PhonoClusters$K2 <- 
  PhonoScaleNumbers %>%
    kmeans(2) %>%
    pluck(1) %>%
    as_factor()

PhonoClusters$K5 <- 
  PhonoScaleNumbers %>%
    kmeans(5) %>%
    pluck(1) %>%
    as_factor()

PhonoClusters$K10 <- 
  PhonoScaleNumbers %>%
    kmeans(10) %>%
    pluck(1) %>%
    as.numeric() %>%
    subtract(1) %>% 
    as.factor()
  
PhonoClusters

I assign the two clusters on the map of Eurasia, each integer in different colors representing different clusters.

PhonoK2 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K2,
                 color = K2),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "PhonoK2.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  PhonoK2
dev.off()

## quartz_off_screen 
##                 2

PhonoK2

Below shows the five clusters on the map.

PhonoK5 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K5,
                color = K5),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "PhonoK5.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  PhonoK5
dev.off()

## quartz_off_screen 
##                 2

PhonoK5

Below shows the ten clusters on the map.

PhonoK10 <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 label = K10,
                 color = K10),
             data = PhonoClusters,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "PhonoK10.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  PhonoK10
dev.off()

## quartz_off_screen 
##                 2

PhonoK10

Testing the correlation between phonological and geographical distances

I will also test the following hypothesis: Geographical distance correlates with phonological distance. That is, geographically closer lects also tend to be phonologically similar.

I subset Coordinates x.

Coordinates.x <- select(Lect_vs_Lect, lon, lat)

head(Coordinates.x)

I subset Coordinates y.

Coordinates.y <- select(Lect_vs_Lect, i.lon, i.lat)

head(Coordinates.y)

I calculate the geographical distances between two columns of coordinates. I leave out pairs of lects that belong to the same family, as lects belonging to the same family tend to be phonologically similar (by inheritance) and also geographically closer. Below shows the first ten geographical distances.

GeoDist <- Lect_vs_Lect %>%
  mutate(Kilometers =
           distHaversine(Coordinates.x, Coordinates.y) / 1000) %>%
  select(Lect_vs_Lect, Kilometers) %>%
  distinct()

GeoDist

I join the phonological distances to the geographical distances.

PhonoGeoDist <- GeoDist %>% 
  .[PhonoDist, on = .(Lect_vs_Lect), nomatch = 0] %>% 
  .[Lect_vs_Lect, on = .(Lect_vs_Lect), nomatch = 0] %>% 
  filter(Lect != i.Lect)

PhonoGeoDist

I set Burushaski, a language isolate spoken in the middle of Eurasia, as a reference point. The hypothesis is that, the closer a Eurasian Lect is to Burushaski geographically, the closer it is to Burushaski phonologically.

Burushaski <- PhonoGeoDist %>% 
  filter(i.Lect == 'Burushaski') %>% 
  select(-i.Lect, -i.lon, -i.lat)

Burushaski

I subset the coordinates of the Burushaski data.table.

BurushaskiCoords <-
  Burushaski %>% 
  select(lon, lat)

BurushaskiCoords

Among all the lects other than Burushaski, I calculate the greatest distance from a lect to its nearest neighbor.

MaxDistance <- 
  BurushaskiCoords %>% 
  knearneigh(k = 1, longlat = TRUE) %>% 
  knn2nb() %>% 
  nbdists(BurushaskiCoords, longlat = TRUE) %>% 
  unlist() %>% 
  max()
  
MaxDistance

## [1] 1552

I assign the neighbors to each lect, a neighbor defined as a lect within the maximal distance calculated above, so that every lect has at least one neighbor.

WeightMatrix <- 
  BurushaskiCoords %>% 
  dnearneigh(0, MaxDistance, longlat = TRUE) %>% 
  nb2listw(style = 'B')

WeightMatrix

## Characteristics of weights list object:
## Neighbour list object:
## Number of regions: 334 
## Number of nonzero links: 28440 
## Percentage nonzero weights: 25.5 
## Average number of links: 85.1 
## 
## Weights style: B 
## Weights constants summary:
##     n     nn    S0    S1       S2
## B 334 111556 28440 56880 13949472

I perform the Moran’s I test to test the spatial autocorrelation, which shows that the phonological distance patterns are areally clustered.

moran.test(Burushaski$Distance, WeightMatrix)

## 
##  Moran I test under randomisation
## 
## data:  Burushaski$Distance  
## weights: WeightMatrix    
## 
## Moran I statistic standard deviate = 46, p-value <0.0000000000000002
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##         0.2745861        -0.0030030         0.0000366

I perform a spatial regression analysis based on the spatial lag model. The results show that geographical distance to Burushaski is positively correlated with the phonological distance to it.

SpatialLag <- 
  lagsarlm(Distance ~ Kilometers,
         Burushaski,
         listw = WeightMatrix)

summary(SpatialLag)

## 
## Call:lagsarlm(formula = Distance ~ Kilometers, data = Burushaski, 
##     listw = WeightMatrix)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16356 -0.70537 -0.13814  0.60314  4.45891 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error z value           Pr(>|z|)
## (Intercept) 1.00723029 0.18620702  5.4092 0.0000000633083141
## Kilometers  0.00031947 0.00004472  7.1438 0.0000000000009077
## 
## Rho: 0.00347, LR test value: 116, p-value: < 0.000000000000000222
## Asymptotic standard error: 0.000297
##     z-value: 11.7, p-value: < 0.000000000000000222
## Wald statistic: 137, p-value: < 0.000000000000000222
## 
## Log likelihood: -469 for lag model
## ML residual variance (sigma squared): 0.968, (sigma: 0.984)
## Number of observations: 334 
## Number of parameters estimated: 4 
## AIC: 946, (AIC for lm: 1060)
## LM test for residual autocorrelation
## test value: 16.1, p-value: 0.000061165

I visualize the phonological distances to Burushaski on a map of Eurasia.

BurushaskiMap <- EurasiaMap +
  geom_text(aes(x = lon,
                 y = lat,
                 color = Distance,
                 label = as.integer(round(Distance))),
             data = Burushaski,
            show.legend = FALSE) +
  geom_point(x = 74.8, y = 36.2, color = 'black', fill = 'red', shape = 21, size = 3) +
  scale_color_gradient(low = 'red', high = 'black')

cairo_pdf(file = "BurushaskiMap.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  BurushaskiMap
dev.off()

## quartz_off_screen 
##                 2

BurushaskiMap

We see that generally, geographically closer lects to Burushaski (red dot) also tend to be phonologically closer to it.

Naive Bayes Classifier

As a follow-up study, I will examine how well machine learning predicts the the area of a lect given its phonological distance from other lects. For example, based on how similar German is to other Eurasian lects, can we predict that it is spoken in Europe?

I first divide the Eurasian lects into seven different regions solely based on their geographical coordinates: Northeast Asia, Mainland Southeast Asia, Qinghai-Gansu, South Asia, West Asia, Caucasus, and Europe.

Areas <- Lect_LonLat %>% 
  mutate(Area = 
           ifelse(lon > 90 & lon < 105 & lat > 32 & lat < 40,
                 'Qinghai-Gansu',
           ifelse(lon > 90 & lat <= 32,
                  'Mainland Southeast Asia',
           ifelse(lon > 60 & lon <= 90 & lat < 40,
                  'South Asia',
           ifelse(lon > 25 & lon < 50 & lat < 50,
                  'West Asia',
           ifelse(lon > -25 & lon < 50 & lat > 30,
                  'Europe',
                  'Northeast Asia')))))) %>% 
  select(-lon, -lat)
                         
Areas

The map visualizes the lects in the predefined seven areas.

AreasLonLat <- Areas %>% 
  left_join(Lect_LonLat)

## Joining with `by = join_by(Lect, Dummy)`

AreaMap <- EurasiaMap +
  geom_point(aes(x = lon,
                 y = lat,
                 color = Area,
                 shape = Area),
             data = AreasLonLat) +
  scale_shape_manual(values = 0:5) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "AreaMap.pdf",
          family = "Times New Roman",
          width = 6,
          height = 5)
  AreaMap
dev.off()

## quartz_off_screen 
##                 2

AreaMap

The goal is to train a model based on phonological distance to see how well it predicts which one of these six areas a lect is spoken.

I split the Lect vs. Lect column of the PhonoDist table into two lects.

Lect_iLect <- str_split_fixed(
  PhonoDist$Lect_vs_Lect, ' vs. ', n = 2) %>%
  as.data.table() %>%
  setnames(c('Lect', 'i.Lect'))

Lect_vs_Lect

I join the distances to the split lect pairs.

PhonoDistSplit <- PhonoDist %>% 
  bind_cols(Lect_iLect) %>% 
  select(-Lect_vs_Lect)

PhonoDistSplit

I double the split distances by switching Lect.x and Lect.y.

PhonoDistDouble <- PhonoDistSplit %>% 
  rename(c('Lect' = 'i.Lect',
           'i.Lect' = 'Lect')) %>% 
  bind_rows(PhonoDistSplit)

PhonoDistDouble

I make a matrix consisting of lects, their distances from all other lects, and their area.

PhonoDistWide <- PhonoDistDouble %>% 
  pivot_wider(names_from = i.Lect,
              values_from = Distance,
              values_fn = sum) %>% 
  left_join(Areas) %>% 
  as.data.table()

## Joining with `by = join_by(Lect)`

setcolorder(PhonoDistWide, 
            c(ncol(PhonoDistWide), 
            1:(ncol(PhonoDistWide) - 1)))

PhonoDistWide

I train the Naive Bayes Classifier based on half of the lects and their distance from other lects. First, I divide the sample in half by each area. (The proportion of the areas is thus equal in the halved sample.) Then I train the classifier in the first half and test it on the other half. Below shows the first ten predictions.

AreaSample <- Areas %>% 
  group_by(Area) %>% 
  slice_sample(prop = 0.5)

AreaSample

I subset the first half of the lects and their distances, which I will train the Naive Bayes Classifier with.

Train <- PhonoDistWide[Lect %in% AreaSample$Lect]

Train

The remaining half will be the lects where the Naive Bayes Classifier will be tested upon to predict their areas.

Test <- PhonoDistWide %>% 
  filter(!(Lect %in% Train$Lect)) %>% 
  select(-Area)

Test

I train the classifier.

Classifier <- naiveBayes(Area ~ ., Train)

Classifier[1]

## $apriori
## Y
##                  Europe Mainland Southeast Asia          Northeast Asia 
##                      13                      70                      18 
##           Qinghai-Gansu              South Asia               West Asia 
##                       4                      47                      14

The trained classifier then predicts the areas of the remaining lects.

Predict <- predict(Classifier, newdata = Test)

Predict

##   [1] Mainland Southeast Asia Mainland Southeast Asia South Asia             
##   [4] Europe                  Northeast Asia          West Asia              
##   [7] Northeast Asia          South Asia              South Asia             
##  [10] Mainland Southeast Asia West Asia               Northeast Asia         
##  [13] Mainland Southeast Asia Mainland Southeast Asia Europe                 
##  [16] Europe                  Northeast Asia          Mainland Southeast Asia
##  [19] Mainland Southeast Asia Mainland Southeast Asia Europe                 
##  [22] Mainland Southeast Asia Northeast Asia          Mainland Southeast Asia
##  [25] Northeast Asia          Mainland Southeast Asia Northeast Asia         
##  [28] Qinghai-Gansu           South Asia              Mainland Southeast Asia
##  [31] Mainland Southeast Asia South Asia              Northeast Asia         
##  [34] Mainland Southeast Asia South Asia              Mainland Southeast Asia
##  [37] South Asia              Mainland Southeast Asia Mainland Southeast Asia
##  [40] Mainland Southeast Asia Europe                  Europe                 
##  [43] Mainland Southeast Asia Europe                  South Asia             
##  [46] Europe                  West Asia               Northeast Asia         
##  [49] Europe                  Mainland Southeast Asia South Asia             
##  [52] Northeast Asia          Mainland Southeast Asia West Asia              
##  [55] Europe                  South Asia              Northeast Asia         
##  [58] South Asia              West Asia               South Asia             
##  [61] Mainland Southeast Asia Mainland Southeast Asia Northeast Asia         
##  [64] Northeast Asia          West Asia               Mainland Southeast Asia
##  [67] Mainland Southeast Asia South Asia              Northeast Asia         
##  [70] Northeast Asia          Mainland Southeast Asia Mainland Southeast Asia
##  [73] Northeast Asia          South Asia              South Asia             
##  [76] Mainland Southeast Asia South Asia              South Asia             
##  [79] West Asia               Mainland Southeast Asia South Asia             
##  [82] South Asia              Mainland Southeast Asia Mainland Southeast Asia
##  [85] South Asia              South Asia              Mainland Southeast Asia
##  [88] Mainland Southeast Asia Mainland Southeast Asia Mainland Southeast Asia
##  [91] West Asia               Europe                  South Asia             
##  [94] Mainland Southeast Asia Mainland Southeast Asia Qinghai-Gansu          
##  [97] South Asia              Europe                  South Asia             
## [100] South Asia              South Asia              South Asia             
## [103] Northeast Asia          Mainland Southeast Asia South Asia             
## [106] Northeast Asia          Mainland Southeast Asia West Asia              
## [109] Mainland Southeast Asia Mainland Southeast Asia Northeast Asia         
## [112] Mainland Southeast Asia South Asia              Northeast Asia         
## [115] Mainland Southeast Asia South Asia              Mainland Southeast Asia
## [118] Northeast Asia          South Asia              South Asia             
## [121] Mainland Southeast Asia West Asia               Northeast Asia         
## [124] Europe                  Mainland Southeast Asia Northeast Asia         
## [127] Mainland Southeast Asia Northeast Asia          South Asia             
## [130] Mainland Southeast Asia Mainland Southeast Asia South Asia             
## [133] West Asia               Mainland Southeast Asia Mainland Southeast Asia
## [136] Northeast Asia          South Asia              Mainland Southeast Asia
## [139] South Asia              West Asia               Europe                 
## [142] Mainland Southeast Asia South Asia              Northeast Asia         
## [145] South Asia              Mainland Southeast Asia Mainland Southeast Asia
## [148] South Asia              Mainland Southeast Asia Northeast Asia         
## [151] South Asia              Northeast Asia          Northeast Asia         
## [154] South Asia              South Asia              Europe                 
## [157] Mainland Southeast Asia Mainland Southeast Asia Europe                 
## [160] Mainland Southeast Asia Mainland Southeast Asia Mainland Southeast Asia
## [163] Qinghai-Gansu           South Asia              Mainland Southeast Asia
## [166] Mainland Southeast Asia Mainland Southeast Asia West Asia              
## [169] Mainland Southeast Asia
## 6 Levels: Europe Mainland Southeast Asia Northeast Asia ... West Asia

I join the predicted areas into the table of tested lects.

AreaPrediction1 <- Test %>% 
  select(Lect) %>% 
  mutate(Prediction = Predict) %>% 
  left_join(Areas) %>% 
  mutate(Correct = Prediction == Area)

## Joining with `by = join_by(Lect)`

AreaPrediction1

I perform the same training and testing, but training with the latter half as the training and testing the former half. Below shows the first ten predictions.

Train2 <- Test %>% 
  left_join(Areas)

## Joining with `by = join_by(Lect, Dummy)`

Test2 <- Train %>% 
  select(-Area)

Classifier <- naiveBayes(Area ~ ., Train2)

Predict2 <- predict(Classifier, newdata = Test2)

AreaPrediction2 <- Test2 %>% 
  select(Lect) %>% 
  mutate(Prediction = Predict2) %>% 
  left_join(Areas) %>% 
  mutate(Correct = Prediction == Area)

## Joining with `by = join_by(Lect)`

AreaPrediction2

I then join the two halves whose areas are predicted based on each other.

AreaPrediction <- bind_rows(AreaPrediction1, AreaPrediction2)

AreaPrediction

I will analyze how correctly the model has predicted the areas based on confusion matrix.

First, I make the predefined areas into factors.

AreaFactors <- as.factor(AreaPrediction$Area)

head(AreaFactors)

## [1] Mainland Southeast Asia Europe                  Europe                 
## [4] Europe                  Mainland Southeast Asia West Asia              
## 6 Levels: Europe Mainland Southeast Asia Northeast Asia ... West Asia

I also make the predicted areas into factors.

PredictionFactors <- factor(AreaPrediction$Prediction,
                            levels = levels(AreaFactors))

head(PredictionFactors)

## [1] Mainland Southeast Asia Mainland Southeast Asia South Asia             
## [4] Europe                  Northeast Asia          West Asia              
## 6 Levels: Europe Mainland Southeast Asia Northeast Asia ... West Asia

Below is the confusion matrix and the related statistics, based on the combination of the two halves of prediction. The accuracy of the predictions is significantly higher than the No Information Rate (p < 0.001). The Kappa value also shows that the model successfully predicts the areas to a moderate degree.

ConfusionMatrix <- 
  confusionMatrix(PredictionFactors,
                AreaFactors,
                mode = 'everything')

ConfusionMatrixOverall <-
  ConfusionMatrix$overall %>% 
  as.matrix() %>% 
  as.data.frame() %>% 
  rownames_to_column()

colnames(ConfusionMatrixOverall) <- c('Name', 'Value')

ConfusionMatrixOverall %>% 
    xtable(label = 'ConfusionMatrix', 
           caption = 'Confusion matrix based on two halves of Naive Bayes Classifier prediction',
           style = 'latex') %>% 
    booktabs('ConfusionMatrix.tex')

ConfusionMatrixOverall

The F1 values of individual classes show that the model predicts some areas better than others, namely Northeast Asia, Mainland Southeast Asia, and South Asia. This may be partly because these areas have more sample lects than others.

F1 <- ConfusionMatrix$byClass %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename(c('Class' = 'rowname')) %>% 
  mutate(Class = gsub('Class: ', '', Class)) %>% 
  select(Class, F1)

F1 %>% 
  xtable(label = 'F1',
         caption = 'F1 values of individual classes',
         style = 'latex') %>% 
  booktabs('F1.tex')

F1

Below is the visualization of the lects by their predicted areas.

AreaPredictionLonLat <- AreaPrediction %>% 
  left_join(Lect_LonLat)

## Joining with `by = join_by(Lect, Dummy)`

AreaPredictionMap <- EurasiaMap +
  geom_point(aes(x = lon, 
                 y = lat,
                 color = Prediction,
                 shape = Prediction),
             data = AreaPredictionLonLat) +
  scale_shape_manual(values = 0:6) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "AreaPredictionMap.pdf",
          family = "Times New Roman",
          width = 6,
          height = 5)
  AreaPredictionMap
dev.off()

## quartz_off_screen 
##                 2

AreaPredictionMap

#Grambank

Load Eurasian lects of Grambank

GrambankLects <- read.csv('GrambankLects.csv') %>% 
  as.data.table() %>% 
  .[Macroarea == 'Eurasia'] %>% 
  .[, .(ID, Longitude, Latitude)]

GrambankLects

Load Grambank and subset to lects present in PanPhon

Grambank <- read.csv('Grambank.csv') %>% 
  as.data.table() %>% 
  .[Language_ID %in% GrambankLects$ID] %>% 
  .[, .(Language_ID, Parameter_ID, Value)]

Grambank

Detect non-binary features

NonBinary <- Grambank %>% 
  .[Value == 2] %>% 
  .[, .(Parameter_ID)] %>% 
  distinct() %>% 
  unlist() %>% 
  as.vector()

NonBinary

## [1] "GB203" "GB193" "GB024" "GB025" "GB065" "GB130"

Exclude non-binary features and numeralize the values

GrambankBinary <- Grambank %>% 
  .[!(Parameter_ID %in% NonBinary)] %>% 
  .[, Value := gsub(0, -1, Value)] %>% 
  .[, Value := gsub('\\?', 0, Value)] %>% 
  .[, Value := as.numeric(Value)]

GrambankBinary

Calculate the proportion of unknown values

Unknown <-  GrambankBinary %>%
  .[Value == 0] %>% 
  .[, .N]/nrow(GrambankBinary)

Unknown

## [1] 0.131

Create a vector of glottocodes

Glottocodes <- GrambankBinary$Language_ID %>% 
  unique()

Glottocodes[1:10]

##  [1] "abkh1244" "acha1249" "aghw1237" "aheu1239" "ahom1240" "ainu1240"
##  [7] "aito1238" "akab1249" "akaj1239" "akha1245"

Widen the Grambank data table

GramDist <- GrambankBinary %>% 
  dcast(Language_ID ~ Parameter_ID, value.var = 'Value', fill = 0) %>% 
  .[, Language_ID := NULL] %>% 
  dist(method = 'manhattan')

GramDist[1:10]

##  [1] 153 149 148 156 122 180 122 169 151 148

K-means clustering (2)

GramK2 <- GramDist %>% 
  kmeans(2) %>%
  pluck(1) %>%
  as_factor()

GramK2[1:10]

##  1  2  3  4  5  6  7  8  9 10 
##  2  1  2  1  1  2  1  2  2  1 
## Levels: 1 2

K-means clustering (5)

GramK5 <- GramDist %>% 
  kmeans(5) %>%
  pluck(1) %>%
  as_factor()

GramK5[1:10]

##  1  2  3  4  5  6  7  8  9 10 
##  1  2  1  2  2  5  4  5  4  2 
## Levels: 1 2 3 4 5

K-means clustering (10)

GramK10 <- GramDist %>% 
  kmeans(10) %>%
  pluck(1) %>%
  as.numeric() %>% 
  map_dbl(~ .x - 1) %>% 
  as_factor()

GramK10[1:10]

##  [1] 7 8 7 1 1 2 0 9 0 4
## Levels: 0 1 2 3 4 5 6 7 8 9

Make a data.table of Glottocodes, coordinates, and K5

GramK <-
  data.table(Glottocode = Glottocodes,
             K2 = GramK2,
             K5 = GramK5,
             K10 = GramK10) %>% 
  .[GrambankLects, on = .(Glottocode = ID), nomatch = 0]

GramK

K2 map

GramK2 <- EurasiaMap +
  geom_text(aes(x = Longitude,
                 y = Latitude,
                 label = K2,
                 color = K2),
             data = GramK,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "GramK2.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  GramK2
dev.off()

## quartz_off_screen 
##                 2

GramK2

K5 map

GramK5 <- EurasiaMap +
  geom_text(aes(x = Longitude,
                 y = Latitude,
                 label = K5,
                 color = K5),
             data = GramK,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "GramK5.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  GramK5
dev.off()

## quartz_off_screen 
##                 2

GramK5

K10 map

GramK10 <- EurasiaMap +
  geom_text(aes(x = Longitude,
                 y = Latitude,
                 label = K10,
                 color = K10),
             data = GramK,
            show.legend = FALSE) +
  theme(legend.position = 'bottom')

cairo_pdf(file = "GramK10.pdf",
          family = "Times New Roman",
          width = 7,
          height = 5)
  GramK10
dev.off()

## quartz_off_screen 
##                 2

GramK10

The Glottocodes of the intersecting lects:

Intersect <- Phonotacticon %>% 
  .[Lect %in% Eurasia$Lect] %>% 
  .[Glottocode %in% GrambankBinary$Language_ID] %>% 
  .[, .(Glottocode, Lect)]

Intersect

Subset PhonoDist into intersecting lects and widen it:

PhonoDistIntersect <- PhonoDist %>% 
  .[Lect_vs_Lect, on = .(Lect_vs_Lect)] %>% 
  .[Lect %in% Intersect$Lect & i.Lect %in% Intersect$Lect] %>% 
  .[, .(Lect, i.Lect, Distance)] %>% 
  dcast(Lect ~ i.Lect, value.var = 'Distance') %>% 
  .[, !c('Lect')] %>% 
  as.matrix() %>% 
  unname() %>%
  as.dist()

PhonoDistIntersect[1:10]

##  [1] 2.54 5.28 2.80 2.43 5.70 3.77 5.02 5.73 5.07 5.81

Subset GramDist into intersecting lects and widen it:

GramDistIntersect <- GrambankBinary %>% 
  .[Intersect, on = c(Language_ID = 'Glottocode'), nomatch = NULL] %>% 
  .[, Language_ID := NULL] %>% 
  dcast(Lect ~ Parameter_ID, value.var = 'Value', fill = 0) %>% 
  .[, Lect := NULL] %>% 
  dist(method = 'manhattan')

GramDistIntersect[1:10]

##  [1] 158 116 127 119 154 152 111  58 137 155

Mantel test

mantel(PhonoDistIntersect, GramDistIntersect, method = 'pearson')

## 
## Mantel statistic based on Pearson's product-moment correlation 
## 
## Call:
## mantel(xdis = PhonoDistIntersect, ydis = GramDistIntersect, method = "pearson") 
## 
## Mantel statistic r: 0.215 
##       Significance: 0.001 
## 
## Upper quantiles of permutations (null model):
##    90%    95%  97.5%    99% 
## 0.0368 0.0479 0.0550 0.0646 
## Permutation: free
## Number of permutations: 999

#PhonGramDist

Make Grambank distances into data.table:

GramDistTable <- GramDistIntersect %>% 
  as.matrix() %>% 
  as.data.table() %>% 
  setnames(., Intersect$Lect) %>% 
  .[, Lect := Intersect$Lect] %>% 
  melt(id.vars = 'Lect',
       variable.name = 'i.Lect',
       value.name = 'Distance') %>% 
  .[, GramDist := scale(Distance)] %>% 
  .[, Distance := NULL]
  
GramDistTable

Make Phonotacticon distances into data.table:

PhonoDistTable <- PhonoDistIntersect %>% 
  as.matrix() %>% 
  as.data.table() %>% 
  setnames(., Intersect$Lect) %>% 
  .[, Lect := Intersect$Lect] %>% 
  melt(id.vars = 'Lect',
       variable.name = 'i.Lect',
       value.name = 'Distance') %>% 
  .[, PhonoDist := scale(Distance)] %>% 
  .[, Distance := NULL]

PhonoDistTable

Calculate the mean between the normalized grammatical and phonological distances:

PhonGramDist <- 
  GramDistTable[PhonoDistTable, on = c('Lect', 'i.Lect')] %>% 
  .[, Distance := (PhonoDist + GramDist)/2] %>% 
  .[Lect_vs_Lect, on = c('Lect', 'i.Lect'), nomatch = NULL] %>% 
  .[, .(Lect_vs_Lect, Distance)] %>% 
  distinct() %>% 
  .[, Distance := Distance - min(Distance)]

PhonGramDist

Analyzing phonological distances based on Phonotacticon 1.0

Ian Joo

2024-01-18