-- Module table; export.tr is attached to it further down and the table is returned at the end of the file.
local export = {}
-- Short aliases for the Unicode-aware string functions provided by mw.ustring.
local U = mw.ustring.char
local rsub = mw.ustring.gsub
local rmatch = mw.ustring.match
-- Vowel marks, built from their code points so the source does not depend on how an editor renders them.
local hbasa = U(0x73C) -- U+073C SYRIAC HBASA-ESASA DOTTED
local rwaha = U(0x73F) -- U+073F SYRIAC RWAHA
local zlama_angular = U(0x739) -- U+0739 SYRIAC DOTTED ZLAMA ANGULAR
local zlama_horizontal = U(0x738) -- U+0738 SYRIAC DOTTED ZLAMA HORIZONTAL
local pthaha = U(0x732) -- U+0732 SYRIAC PTHAHA DOTTED
local zqapha = U(0x735) -- U+0735 SYRIAC ZQAPHA DOTTED
-- All six vowel marks as one string, plus a single-character capturing class over them.
local diacritic_vowels = hbasa .. rwaha .. zlama_angular .. zlama_horizontal .. pthaha .. zqapha
local diacritic_vowels_capture = "([" .. diacritic_vowels .. "])"
local talqana_above = U(0x747) -- U+0747 SYRIAC OBLIQUE LINE ABOVE
local combining_diaeresis = U(0x308) -- U+0308 COMBINING DIAERESIS
-- We declare the consonants that can represent vowels (matres lectionis) as constants to mitigate differences in how
-- a line mixing right-to-left and left-to-right characters is displayed in an IDE vs. on Wiktionary. Because these
-- letters are joined with the .. operator, "ܘ" .. "ܐ" on Wiktionary would be displayed as "ܐ" .. "ܘ" in an IDE.
local alaph = U(0x710) -- U+0710 SYRIAC LETTER ALAPH
local waw = U(0x718) -- U+0718 SYRIAC LETTER WAW
local yudh = U(0x71D) -- U+071D SYRIAC LETTER YUDH
-- Generic combining marks and Syriac consonant marks referenced by the substitution rules below.
local combining_tilde_below = U(0x330) -- U+0330 COMBINING TILDE BELOW
local combining_tilde_above = U(0x303) -- U+0303 COMBINING TILDE
local combining_macron_below = U(0x331) -- U+0331 COMBINING MACRON BELOW
local combining_macron = U(0x304) -- U+0304 COMBINING MACRON
local qushshaya = U(0x741) -- U+0741 SYRIAC QUSHSHAYA
local rukkakha = U(0x742) -- U+0742 SYRIAC RUKKAKHA
local combining_breve_below = U(0x32E) -- U+032E COMBINING BREVE BELOW
local combining_dot_below = U(0x323) -- U+0323 COMBINING DOT BELOW
local combining_dot_above = U(0x307) -- U+0307 COMBINING DOT ABOVE
-- Per-character punctuation lookup used by export.tr: opening and closing quotes are swapped,
-- and Arabic-script punctuation and guillemets are replaced by Latin-script punctuation.
local tt_transpose_punc = {
    -- left/right single/double quotes
    ["“"] = "”",
    ["”"] = "“",
    ["‘"] = "’",
    ["’"] = "‘",
    ["؟"] = "?", -- question mark
    ["«"] = "“", -- quotation mark
    ["»"] = "”", -- quotation mark
    ["،"] = ",", -- comma
    ["؛"] = ";", -- semicolon
}
-- Every key of tt_transpose_punc joined into one string, for use inside a character class.
-- pairs() gives no guaranteed order, which is fine because set members are unordered.
local tt_transpose_punc_keys
do
    local collected = {}
    for punctuation in pairs(tt_transpose_punc) do
        collected[#collected + 1] = punctuation
    end
    tt_transpose_punc_keys = table.concat(collected)
end
-- Ordered { pattern, replacement } pairs that export.tr applies to the padded text before any other rule.
local fix = {
-- put qushshaya in front of a vowel mark so that it directly follows its consonant
{ diacritic_vowels_capture .. qushshaya, qushshaya .. "%1"},
-- under the hood mw uses NFC, which preempts the following two substitutions...
-- feel free to uncomment them if a test case can be added that requires them
-- https://www.mediawiki.org/wiki/Unicode_normalization_considerations
-- { diacritic_vowels_capture .. rukkakha, rukkakha .. "%1"},
-- { diacritic_vowels_capture .. combining_tilde_below, combining_tilde_below .. "%1" },
-- partition punctuation marks so "starts with" and "ends with" substitutions work
{"([".. tt_transpose_punc_keys .. "()!.:\"'])", "#%1#"},
}
-- Base consonant lookup: one Syriac letter to its default Latin value.
-- export.tr applies it per character after the rules for letters that carry a modifying mark.
-- alaph, waw and yudh are deliberately absent; they are handled by later rules and by tt_next.
local tt = {
["ܦ"] = "p",
["ܒ"] = "b",
["ܬ"] = "t",
["ܛ"] = "ṭ",
["ܕ"] = "d",
["ܟ"] = "k",
["ܓ"] = "g",
["ܩ"] = "q",
["ܔ"] = "j",
["ܣ"] = "s",
["ܨ"] = "ṣ",
["ܙ"] = "z",
["ܫ"] = "š",
["ܚ"] = "ḥ",
["ܥ"] = "ʿ",
["ܗ"] = "h",
["ܡ"] = "m",
["ܢ"] = "n",
["ܪ"] = "r",
["ܠ"] = "l",
}
-- local tt_keys = ''
-- for key, _ in pairs(tt) do tt_keys = tt_keys .. key end

-- All Latin values of tt joined into one string (order is irrelevant: it is only used in character classes).
local tt_values
do
    local latin_values = {}
    for _, latin in pairs(tt) do
        latin_values[#latin_values + 1] = latin
    end
    tt_values = table.concat(latin_values)
end

-- Capturing classes used by export.tr with the combining macron below (mhagjana) and the
-- combining macron above (marhetana). The Syriac letters are romanised through tt first,
-- because export.tr applies these classes after the per-letter tt substitution.
local mhagjana_capture, marhetana_capture
do
    local mhagjana_letters = rsub('ܗܠܡܢܥܪ', ".", tt)
    local marhetana_letters = rsub('ܦܒܬܛܕܟܓܩܔܣܨܙܫܚ', ".", tt) -- should gamal garshuni be removed?
    mhagjana_capture = "([" .. mhagjana_letters .. alaph .. yudh .. waw .. "])"
    marhetana_capture = "([" .. marhetana_letters .. "])"
end

-- One-letter prefixes that export.tr separates with a hyphen
-- https://r12a.github.io/scripts/syrc/aii.html#single_letter_words
local bdul = 'ܒܕܘܠ'
local bdul_capture = '([' .. bdul .. '])'
local bdul_capture2 = bdul_capture .. bdul_capture -- two stacked prefixes, captured separately

-- local alphabet = ''
-- for letter, _ in pairs(tt) do alphabet = alphabet .. letter end
-- alphabet = alphabet .. yudh .. waw .. alaph
-- local alphabet_capture = '([' .. alphabet .. '])'
-- Second-stage lookup: export.tr applies it per character once the context-dependent rules for
-- alaph, waw and yudh have run. It turns any remaining waw/yudh into the consonants w/y and the
-- remaining vowel marks into Latin vowels.
local tt_next = {
[waw] = "w",
[yudh] = "y",
[rwaha] = "o",
[zlama_angular] = "ē",
[zlama_horizontal] = "i",
[pthaha] = "a",
[zqapha] = "ā"
}
-- Character classes shared by the rules in export.tr. The glides are still in Syriac script at the
-- point where these classes are used, while all other consonants have already been romanised.
local glides = alaph .. yudh .. waw -- unvoweled, original values of matres lectionis (consonants representing vowels)
-- Latin values from tt plus the extra consonants produced from letters that carry a modifying mark.
-- NOTE(review): export.tr emits "č" (not plain "c") for kaph with a tilde, so "č" is not covered by
-- this class and the classes derived from it -- confirm whether "c" was meant to be "č".
local consonants_minus_glides = tt_values .. "cžfḇṯḏḵḡ"
local consonants_capture = "([" .. glides .. consonants_minus_glides .. "])"
local consonants_capture_minus_alaph = "([" .. yudh .. waw .. consonants_minus_glides .. "])"
-- Latin vowels, grouped by the glide that export.tr inserts after them when another vowel follows.
local vowels_w = "uo"
local vowels_y = "eiēī"
local vowels = vowels_y .. vowels_w .. "aā"
local consonants_and_vowels_capture = "([" .. glides .. consonants_minus_glides .. vowels .. "])"
-- Ordered list of whole-word and partial-word exceptions. export.tr applies the pairs in the
-- order listed, right after the `fix` substitutions and before any of the general rules, so
-- an earlier entry can consume text that a later entry would otherwise have matched.
local special_cases = {
-- { matching_aii_text, latin_substitution }
--
-- the # symbol pads the start and end of a word; consider the following examples for matching_aii_text
-- #float# only float matches
-- #float words starting with float like float or floats match
-- float# words ending with float like float or afloat match
-- float words containing float like float, floats, afloat and refloats match
{"#ܒܗ" .. combining_dot_above .. "ܝ#", "#bay#"},
{"ܗ" .. combining_dot_above .. "ܝ#", "aya#"},
{"ܗ" .. combining_dot_above .. "ܘ#", "awa#"},
{"ܡ" .. combining_dot_above .. "ܢ#", "man#"},
{"ܡ" .. combining_dot_below .. "ܢ#", "min#"},
-- (the second pair on the next line is repeated under "houses" below)
{"ܒܵܬܹܐ#", "bāttē#"}, {"ܒܵܬܲܝ̈#", "bāttay#"},
{"ܟ̰ܵܐܝ", "čāy"},
{"ܒܵܐܝ", "bāy"},
{"ܐܲܦ̮ܘܿܟܵܕ", "avokād"},
{"ܝܼܫܘܿܥ#", "īšoʿ#"}, -- starts with vowel but not silent alaph
-- "to be" without initial khwasa, ì
{"#ܝܘܸܢ#", "#ìwen#"}, {"#ܝܘܵܢ#", "#ìwān#"},
{"#ܝܘܲܚ#", "#ìwaḥ#"}, {"#ܝܘܸܬ#", "#ìwet#"},
{"#ܝܘܵܬܝ#", "#ìwāt#"}, {"#ܝܬܘܿܢ#", "#ìton#"},
{"#ܝܠܹܗ#", "#ìlēh#"}, {"#ܝܠܵܗ̇#", "#ìlāh#"},
{"#ܝܢܵܐ#", "#ìnā#"}, {"#ܝܗ݇ܘܵܐ#", "#ìwā#"},
{"#ܝܗ݇ܘܵܘ#", "#ìwā#"},
-- "to be" with initial khwasa, ī
-- https://en.wiktionary.org/wiki/Template:aii-conj-verb/hawe
{"ܝܼܘܸܢ#", "īwen#"}, {"ܝܼܘܵܢ", "īwān"},
{"ܝܼܘܸܬ#", "īwet#"}, {"ܝܼܘܵܬܝ#", "īwāt#"},
{"ܝܼܠܹܗ#", "īlēh#"}, {"ܝܼܠܵܗ̇#", "īlāh#"},
{"ܝܼܘܲܚ#", "īwaḥ#"}, {"ܝܼܬܘܿܢ#", "īton#"}, {"ܝܼܢܵܐ#", "īnā#"},
-- "to be" blends
{"ܝܼܗ݇ܘܵܐ#", "īwā#"}, {"ܝܼܗ݇ܘܵܘ#", "īwā#"},
-- "to be" imperative forms
-- following substitutions starting with '#w' are to pre-empt 'w-' prefixing rule
{"#ܗ݇ܘܝܼ", "#wī"}, {"#ܗ݇ܘܹܝܡܘܼܢ#", "#wēmun#"},
-- "to be" past particles
{"#ܗ݇ܘܵܐ#", "#wā#"},
{"#ܗ݇ܘܵܘ#", "#wā#"},
{"#ܗ݇ܘܹܐ#", "#wē#"},
-- "of" (fixed expressions)
{"#ܕܝܵܡܵܐ#", "#d-yāmā#"}, {"#ܕܠܵܐ#", "#d-lā#"},
-- "to" (fixed expressions)
{"ܠܒܲܕܲܪ#" , "l-baddar#"}, {"ܠܓܵܘܵܐ#", "l-gāwā#"},
-- "per" (fixed expressions with time)
{"ܒܕܲܩܝܼܩܵܐ#", "b-daqīqā#"}, {"ܒܪܦܵܦܵܐ#", "b-rpāpā#"},
{"ܒܝܵܘܡܵܐ#", "b-yāwmā#"}, {"ܒܫܵܥܬܵܐ#", "b-šāʿtā#"},
{"ܒܝܲܪܚܵܐ#", "b-yarḥā#"}, {"ܒܫܵܒ݂ܘܿܥܵܐ#", "b-šāḇoʿā#"},
{"ܒܕܵܘܪܵܐ#", "b-dāwrā#"}, {"ܒܫܹܢ݇ܬܵܐ#", "b-šētā#"},
-- adverbs with clitics (fixed expressions)
-- NOTE(review): the second pair on the next line contains a bare space, but export.tr rewrites
-- every space to "# #" before the special cases run (and the first pair has already replaced the
-- shared prefix), so it looks unreachable -- confirm the intended pattern.
{"ܠܩܘܼܪܒܵܐ", "l-qurbā"}, {"ܠܩܘܼܪܒܵܐ ܕ", "l-qurbā d-"},
{"ܒܡܸܬܚܵܐ", "b-mitḥā"},
{"ܒܟܠ#", "b-kul#"}, {"ܕܗܵܘܝܵܐ#", "d-hāwyā#"},
-- "all", "each", "every"
{"ܟܠ#", "kul#"}, {"ܟܠܵܢ#", "kullān#"},
{"ܟܠܘܼܟ݂#", "kulluḵ#"}, {"ܟܠܵܟ݂ܝ#", "kullāḵ#"},
{"ܟܠܹܗ#", "kullēh#"}, {"ܟܠܵܗ̇#", "kullāh#"},
{"ܟܠܘܼܗܝ#", "kulluh#"}, {"ܟܠܘܿܗ̇#", "kulloh#"},
{"ܟܠܲܢ#", "kullan#"}, {"ܟܠܵܘܟ݂ܘܿܢ#", "kullāwḵon#"},
{"ܟܠܵܝܗܝ#", "kullāyh#"}, {"ܟܠܗܘܿܢ#", "kullhon#"},
{"ܟܠܵܢܵܐܝܼܬ#", "kullānāʾīt#"}, {"ܟܠܵܢܵܐܝܼܬ݂#", "kullānāʾīṯ#"},
{"ܟܠܵܢܵܝ", "kullānāy"}, {"ܟܘܿܠܵܝ", "kollāy"},
{"ܟܠܚܲܕ݇#", "kulḥa#"}, {"ܟܠܚܕ݂ܵܐ#", "kulḥḏā#"},
{"ܟܠܫܲܢ݇ܬ#", "kulšat#"},
-- "classical because"
{"ܡܸܛܠ#", "miṭṭul#"}, {"ܡܸܛܠܬܝܼ#", "miṭṭultī#"},
{"ܡܸܛܠܬܘܼܟ݂#", "miṭṭultuḵ#"}, {"ܡܸܛܠܬܵܟ݂ܝ#", "miṭṭultāḵ#"},
{"ܡܸܛܠܬܹܗ#", "miṭṭultēh#"}, {"ܡܸܛܠܬܵܗ̇#", "miṭṭultāh#"},
{"ܡܸܛܠܬܘܼܗܝ#", "miṭṭultuh#"}, {"ܡܸܛܠܬܘܿܗ̇#", "miṭṭultoh#"},
{"ܡܸܛܠܬܲܢ#", "miṭṭultan#"}, {"ܡܸܛܠܬܵܘܟ݂ܘܿܢ#", "miṭṭultāwḵon#"},
{"ܡܸܛܠܬܵܝܗܝ#", "miṭṭultāyh#"}, {"ܡܸܛܠܬܗܘܿܢ#", "miṭṭulthon#"},
{"ܡܸܛܠܵܐܝܼܬ݂#", "miṭṭullāʾīṯ#"},
-- "houses"
{"ܒܵܬܹ̈ܐ#", "bāttē#"}, {"ܒܵܬܲܝ̈#", "bāttay#"},
{"ܒܵܬܝܼ̈#", "bāttī#"}, {"ܒܵܬܲܢ̈#", "bāttān#"},
{"ܒܵܬܘܼ̈ܟ݂#", "bāttuḵ#"}, {"ܒܵܬܵܟ݂ܝ̈#", "bāttāḵ#"},
{"ܒܵܬܘܼ̈ܗܝ#", "bāttuh#"}, {"ܒܵܬ̈ܘܿܗ̇#", "bāttoh#"},
{"ܒܵܬܵܘ̈ܟ݂ܘܿܢ#", "bāttāwḵon#"}, {"ܒܵܬ̈ܗܘܿܢ#", "bātthon#"},
-- popular slang terms
{"ܝܲܐܠܵܗ#", "yallāh#"}, {"ܘܲܐܠܵܗ#", "wallāh#"},
-- feminine imperative forms
{"ܙܹܠ݇ܝ#", "zē#"}, {"ܬܵܐܝ#", "tā#"},
-- 2 past tense forms
{"ܝܼܐ#", "īʾ#"},
}
-- special_cases with the combining diaeresis (U+0308) removed from every pattern.
-- export.tr deletes that mark from its input before the special cases are applied, so a
-- pattern that still contains it can never match. Normalising the patterns the same way as
-- the input makes those entries effective; doing it once at load time keeps calls cheap.
local special_cases_normalized = {}
for index, sub in ipairs(special_cases) do
    -- the extra parentheses drop the substitution count that rsub also returns
    special_cases_normalized[index] = { (rsub(sub[1], combining_diaeresis, "")), sub[2] }
end

-- Transliterates Syriac-script text into Latin script.
--
-- The text is rewritten by a fixed sequence of substitutions; the order matters because later
-- rules rely on what earlier rules have already replaced (for example, consonants are Latin
-- while alaph, waw and yudh are still Syriac for most of the glide and vowel rules).
--
-- text: the string to transliterate.
-- lang, sc: accepted for the caller's calling convention; not used by this function.
-- Returns the transliteration, or nil when the result contains no vowel, hyphen or space
-- (in that case the "aii-translit/lacking diacritics" tracking key is recorded first).
function export.tr(text, lang, sc)
    -- Pad every word with "#" so rules can anchor on the start and end of a word.
    text = rsub(text, " | ", "# | #")
    text = "##" .. rsub(text, " ", "# #") .. "##"
    text = rsub(text, "ـ", "")
    -- The combining diaeresis is dropped up front so no later rule has to account for it.
    text = rsub(text, combining_diaeresis, "")
    for _, sub in ipairs(fix) do text = rsub(text, unpack(sub)) end
    -- Special cases (patterns normalised above so entries written with a diaeresis still match)
    for _, sub in ipairs(special_cases_normalized) do text = rsub(text, unpack(sub)) end
    -- Letters modified by a tilde or breve stand for additional consonants.
    text = rsub(text, "ܟ" .. combining_tilde_below, "č")
    text = rsub(text, "ܓ" .. combining_tilde_below, "j")
    text = rsub(text, "ܫ" .. combining_tilde_below, "ž")
    text = rsub(text, "ܙ" .. combining_tilde_above, "ž")
    text = rsub(text, "ܟ" .. combining_tilde_above, "č")
    text = rsub(text, "ܫ" .. combining_tilde_above, "ž")
    text = rsub(text, "ܦ" .. combining_breve_below, "f")
    -- A letter marked with qushshaya keeps its plain value...
    text = rsub(text, "ܦ" .. qushshaya, "p") -- needs a test case
    text = rsub(text, "ܒ" .. qushshaya, "b")
    text = rsub(text, "ܬ" .. qushshaya, "t")
    text = rsub(text, "ܕ" .. qushshaya, "d")
    text = rsub(text, "ܟ" .. qushshaya, "k")
    text = rsub(text, "ܓ" .. qushshaya, "g")
    -- ...while the same letter marked with rukkakha gets the underlined/overlined variant.
    text = rsub(text, "ܒ" .. rukkakha, "ḇ")
    text = rsub(text, "ܬ" .. rukkakha, "ṯ")
    text = rsub(text, "ܕ" .. rukkakha, "ḏ")
    text = rsub(text, "ܟ" .. rukkakha, "ḵ")
    text = rsub(text, "ܓ" .. rukkakha, "ḡ")
    -- this covers b-, d-, w-, l- prefixing for words starting with an alaph
    -- https://r12a.github.io/scripts/syrc/aii.html#standalone
    -- and ALL special_cases starting with initial_translit_char
    local initial_translit_char = 'aī' -- accounts for substituted special cases starting with vowel sound
    local initial_char_capture = "([" .. alaph .. initial_translit_char .. "])"
    text = rsub(text, "#" .. bdul_capture2 .. initial_char_capture, "#%1-%2-%3")
    text = rsub(text, "#" .. bdul_capture .. initial_char_capture, "#%1-%2")
    -- waw/yudh that carry a vowel mark are vowels, not glides
    text = rsub(text, waw .. hbasa .. "ܗ" .. combining_dot_above .. "#", "oh#")
    text = rsub(text, yudh .. hbasa, "⚹") -- ⚹ is placeholder for later substitution
    text = rsub(text, waw .. rwaha, "o")
    text = rsub(text, waw .. hbasa, "u")
    -- per-character lookups: punctuation first, then the base consonants
    text = rsub(text, ".", tt_transpose_punc)
    text = rsub(text, ".", tt)
    text = rsub(text, "#" .. alaph .. "#", "#ʾ#") -- needs a test case
    -- a macron below inserts "e" before the marked letter, a macron above inserts it after
    text = rsub(text, consonants_capture .. mhagjana_capture .. combining_macron_below .. consonants_capture, "%1e%2%3")
    text = rsub(text, consonants_capture .. marhetana_capture .. combining_macron .. consonants_capture, "%1%2e%3")
    -- double a consonant that stands between zlama horizontal/pthaha and another vowel mark
    text = rsub(text, "([" .. zlama_horizontal .. pthaha .. "])" .. consonants_capture .. diacritic_vowels_capture, "%1%2%2%3")
    -- a consonant carrying the talqana line is dropped together with the mark
    text = rsub(text, consonants_capture .. talqana_above, "")
    text = rsub(text, combining_dot_above, "")
    -- yudh+khwasa sandwiched between vowelless atootas should sound like [ɪ], <i> not [i], <ī>
    text = rsub(text, consonants_capture_minus_alaph .. '⚹' .. consonants_capture .. '([^' .. diacritic_vowels ..'])', "%1i%2%3")
    text = rsub(text, "⚹", "ī")
    -- remaining yudh and alaph, depending on their position in the word
    text = rsub(text, consonants_capture .. zlama_angular .. yudh .. consonants_capture, "%1ē%2")
    text = rsub(text, consonants_capture .. yudh .. consonants_capture, "%1i%2")
    text = rsub(text, "([" .. consonants_minus_glides .. "])" .. yudh .. "#", "%1#")
    text = rsub(text, alaph .. pthaha .. waw .. "#", "aw#") -- needs a test case (impossible combination of characters?)
    text = rsub(text, alaph .. pthaha .. yudh .. "#", "ay#") -- needs a test case
    text = rsub(text, "#" .. alaph .. zlama_angular .. yudh, "#ē") -- needs a test case
    text = rsub(text, "#" .. alaph .. yudh, "#ī") -- needs a test case
    text = rsub(text, "#" .. yudh .. consonants_capture, "#%1")
    text = rsub(text, pthaha .. alaph .. "#", "a#") -- needs a test case
    text = rsub(text, zlama_angular .. alaph .. "#", "ē#")
    text = rsub(text, zqapha .. alaph .. "#", "ā#") -- needs a test case
    text = rsub(text, alaph .. "#", "ā#") -- needs a test case
    text = rsub(text, "#" .. alaph, "#")
    text = rsub(text, alaph, "ʾ")
    -- a word-initial waw directly followed by a consonant or vowel is the "w-" prefix
    text = rsub(text, "#" .. waw .. consonants_and_vowels_capture, "#w-%1")
    -- everything still in Syriac script: remaining glides and vowel marks
    text = rsub(text, ".", tt_next)
    text = rsub(text, "([ēīā])" .. "ʾ" .. consonants_capture, "%1%2")
    -- separate two adjacent vowels with the glide that matches the first one
    text = rsub(text, "([" .. vowels_w .. "])([" .. vowels .. "])", "%1w%2") -- needs a test case
    text = rsub(text, "([" .. vowels_y .. "])([" .. vowels .. "])", "%1y%2")
    -- collapse doubled letters for which no doubling is written, and drop ʾ right after a prefix hyphen
    text = rsub(text, "ʿʿ", "ʿ") -- needs a test case
    text = rsub(text, "ʾʾ", "ʾ") -- needs a test case
    text = rsub(text, "-ʾ", "-")
    text = rsub(text, "ḇḇ", "ḇ")
    text = rsub(text, "ḡḡ", "ḡ")
    text = rsub(text, "ḏḏ", "ḏ")
    text = rsub(text, "ḵḵ", "ḵ")
    text = rsub(text, "p̄p̄", "p̄")
    text = rsub(text, "ṯṯ", "ṯ")
    -- local bdul_capture = '([bdwl])'
    -- text = rsub(text, "#" .. bdul_capture .. "([" .. vowels .. "])", "#%1-%2")
    -- remove the word padding again
    text = rsub(text, "#", "")
    -- No vowel, hyphen or space anywhere means the input lacked vowel marks: record it for
    -- tracking and return nil instead of a transliteration.
    if not rmatch(text, "([-" .. vowels .. " ])") then
        require("Module:debug").track("aii-translit/lacking diacritics")
        return nil
    end
    return text
end
return export