-- มอดูล:za-pron (Module:za-pron)
--
-- จาก วิกิพจนานุกรม พจนานุกรมเสรี (from Wiktionary, the free dictionary)
-- Table of public functions; it is returned at the end of this module.
local export = {}
-- Local aliases for Scribunto's Unicode-aware string helpers (mw.ustring)
-- and the text splitter (mw.text), so the code below can call them by short name.
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local gsplit = mw.text.gsplit
local lower = mw.ustring.lower
local upper = mw.ustring.upper
local len = mw.ustring.len

-- https://en.wikipedia.org/wiki/Standard_Zhuang
-- http://baike.baidu.com/item/%E5%A3%AE%E8%AF%AD/7703463
-- http://www.gxmyw.com.cn/plus/list.php?tid=21

-- Character that fix() accepts as an optional prefix in front of a run of Latin letters.
local apost = '‘'

-- Zhuang initial spelling (lowercase) -> IPA.
-- Used by export.convert for the 'IPA' scheme, and by fix() to decide whether a
-- consonant cluster in front of a vowel is a valid initial.
-- The empty-string key is the rendering used when a syllable has no initial letters.
local initialConv = {
	['b']   = 'p',
	['mb']  = 'ɓ',
	['m']   = 'm',
	['f']   = 'f',
	['v']   = 'β',
	['by']  = 'pʲ',
	['my']  = 'mʲ',

	['d']   = 't',
	['nd']  = 'ɗ',
	['n']   = 'n',
	['l']   = 'l',
	['s']   = 'θ',

	['ny']  = 'ɲ',
	['c']   = 'ɕ',
	['y']   = 'j',

	['g']   = 'k',
	['ng']  = 'ŋ',
	['r']   = 'ɣ',
	['gy']  = 'kʲ',
	['ngv'] = 'ŋʷ',
	['gv']  = 'kʷ',

	['']    = 'ʔ',
	['h']   = 'h',
}
	-- Shape of an initial as a pattern: [bmfvdnslghrcy]?[gbd]?[vy]?

-- Zhuang initial spelling (lowercase) -> Thai letters.
-- Used by export.convert for the 'thai' scheme; it has the same keys as initialConv.
-- The empty-string key is the rendering used when a syllable has no initial letters.
local ThaiConv = {
	['b']   = 'ป',
	['mb']  = 'บ',
	['m']   = 'ม',
	['f']   = 'ฟ',
	['v']   = 'ว',
	['by']  = 'ปฺย',
	['my']  = 'มฺย',

	['d']   = 'ต',
	['nd']  = 'ด',
	['n']   = 'น',
	['l']   = 'ล',
	['s']   = 'ซ',

	['ny']  = 'ญ',
	['c']   = 'ช',
	['y']   = 'ย',

	['g']   = 'ก',
	['ng']  = 'ง',
	['r']   = 'ฆ',
	['gy']  = 'กฺย',
	['ngv'] = 'งฺว',
	['gv']  = 'กฺว',

	['']    = 'อ',
	['h']   = 'ฮ',
}

-- Zhuang vowel spelling (lowercase) -> IPA, used by export.convert for the 'IPA' scheme.
-- Each entry has two forms:
--   alone  - the form looked up when the syllable has no final consonant
--   wfinal - the form looked up when the syllable has a final consonant
-- A value of false marks a combination listed as impossible in the notes below the table.
local vowelConv = {
	['a']   = { alone = 'a', wfinal = 'aː' },
	['e']   = { alone = 'e', wfinal = 'eː' },
	['i']   = { alone = 'i', wfinal = 'i' },
	['o']   = { alone = 'o', wfinal = 'oː' },
	['u']   = { alone = 'u', wfinal = 'u' },
	['w']   = { alone = 'ɯ', wfinal = 'ɯ' },

	['ai']  = { alone = 'aːi', wfinal = false },
	['ei']  = { alone = 'ei', wfinal = false },
	['oi']  = { alone = 'oːi', wfinal = false },
	['ui']  = { alone = 'uːi', wfinal = false },
	['wi']  = { alone = 'ɯːi', wfinal = false },

	['ae']  = { alone = 'ai', wfinal = 'a' },
	['ie']  = { alone = false, wfinal = 'iː' },
	['oe']  = { alone = false, wfinal = 'o' },
	['ue']  = { alone = false, wfinal = 'uː' },
	['we']  = { alone = false, wfinal = 'ɯː' },

	['au']  = { alone = 'aːu', wfinal = false },
	['aeu'] = { alone = 'au', wfinal = false },
	['eu']  = { alone = 'eːu', wfinal = false },
	['iu']  = { alone = 'iu', wfinal = false },
	['ou']  = { alone = 'ou', wfinal = false },

	['aw']  = { alone = 'aɯ', wfinal = false },
}
	-- Shape of a vowel as a pattern: [aeiouw][ieu]?[uw]?
	-- Vowels that can take a final: [aeiouw]e?
	-- Vowels that cannot take a final: ai, ei, oi, ui, wi, au, aeu, eu, iu, ou, aw // [aeiouw]e?[iuw]
	-- Vowels that cannot occur without a final: ie, oe, ue // [iou]e

-- Zhuang vowel spelling (lowercase) -> Thai vowel signs, used by export.convert
-- for the 'thai' scheme. Same keys and same alone / wfinal layout as vowelConv:
--   alone  - the form looked up when the syllable has no final consonant
--   wfinal - the form looked up when the syllable has a final consonant
-- Values are concatenated after the initial; export.convert afterwards moves the
-- signs เ โ ใ ไ in front of the preceding character.
local vowelThaiConv = {
	['a']   = { alone = 'า', wfinal = 'า'},
	['e']   = { alone = 'เ', wfinal = 'เ'},
	['i']   = { alone = 'ี', wfinal = 'ิ' },
	['o']   = { alone = 'โ', wfinal = 'โ'},
	['u']   = { alone = 'ู', wfinal = 'ุ' },
	['w']   = { alone = 'ือ', wfinal = 'ึ' },

	['ai']  = { alone = 'าย', wfinal = false },
	['ei']  = { alone = 'เ็ย', wfinal = false },
	['oi']  = { alone = 'โย', wfinal = false },
	['ui']  = { alone = 'ูย', wfinal = false },
	['wi']  = { alone = 'ืย', wfinal = false },

	['ae']  = { alone = 'ไ', wfinal = 'ั' },
	['ie']  = { alone = false, wfinal = 'ี' },
	['oe']  = { alone = false, wfinal = 'โ็' },
	['ue']  = { alone = false, wfinal = 'ู' },
	['we']  = { alone = false, wfinal = 'ื' },

	['au']  = { alone = 'าว', wfinal = false },
	['aeu'] = { alone = 'เา', wfinal = false },
	['eu']  = { alone = 'เว', wfinal = false },
	['iu']  = { alone = 'ิว', wfinal = false },
	['ou']  = { alone = 'โ็ว', wfinal = false },

	['aw']  = { alone = 'ใ', wfinal = false },
}

-- Zhuang final consonant spelling -> IPA, used by export.convert for the 'IPA' scheme.
-- The voiced and voiceless spellings of each stop (p/b, t/d, k/g) map to the same IPA symbol.
-- The empty-string key covers syllables without a final.
local finalConv = {
	['']   = '',
	['m']  = 'm',

	['n']  = 'n',
	['ng'] = 'ŋ',
	['p']  = 'p',
	['b']  = 'p',
	['t']  = 't',
	['d']  = 't',
	['k']  = 'k',
	['g']  = 'k',
}
	-- Shape of a final as a pattern: [mnpbtdkg]?g?

-- Zhuang final consonant spelling -> Thai letter, used by export.convert for the 'thai' scheme.
-- Same keys as finalConv; the empty-string key covers syllables without a final.
local finalThaiConv = {
	['']   = '',
	['m']  = 'ม',

	['n']  = 'น',
	['ng'] = 'ง',
	['p']  = 'ป',
	['b']  = 'ป',
	['t']  = 'ต',
	['d']  = 'ต',
	['k']  = 'ก',
	['g']  = 'ก',
}

-- Tone number (as a string) -> IPA tone letters, used by export.convert for the 'IPA' scheme.
-- The trailing comments give the tone contour in digits and, for tones 2-6, the tone letter
-- used in the spelling. The key '7:' is never present in the input: export.convert switches
-- tone '7' to '7:' when the IPA vowel of the syllable contains a length mark.
local toneConv = {
	['1']   = '˨˦', --24
	['2']  = '˧˩', --31 z
	['3']  = '˥', --55 j
	['4']  = '˦˨', --42 x
	['5']  = '˧˥', --35 q
	['6']  = '˧', --33 h

	['7']  = '˥', --55
	['7:'] = '˧˥', --35
	['8']  = '˧', --33
}

-- Tone number (as a string) -> superscript Thai tone name, used by export.convert for the
-- 'thai' scheme. Same keys as toneConv; the trailing comments repeat the contour digits.
-- NOTE: the labels for tones 2 and 4 are stored with their first two letters swapped
-- ('อเก', 'ทโ'). export.convert runs a substitution that swaps any character followed by one
-- of เ โ ใ ไ over text that includes this label, which puts those two letters back in order.
-- Do not "correct" the spelling here without changing that substitution as well.
local toneThaiConv = {
	['1']   = '<sup>จัตวา</sup>', --24
	['2']  = '<sup>อเก</sup>', --31 z
	['3']  = '<sup>ตรี</sup>', --55 j
	['4']  = '<sup>ทโ</sup>', --42 x
	['5']  = '<sup>ตรี</sup>', --35 q
	['6']  = '<sup>สามัญ</sup>', --33 h

	['7']  = '<sup>ตรี</sup>', --55
	['7:'] = '<sup>ตรี</sup>', --35
	['8']  = '<sup>สามัญ</sup>', --33
}

-- Tone letter of the spelling -> tone number (as a string).
-- fix() applies this table through a gsub over '[zjxq]' only, so the '' and 'h'
-- entries are not reached by that call.
local toneConvToNumbers = {
	['']   = '1',
	['z']  = '2',
	['j']  = '3',
	['x']  = '4',
	['q']  = '5',
	['h']  = '6',
}

-- Tone number (as a string) -> tone letter of the spelling; the reverse of toneConvToNumbers.
-- Used by export.convert for the 'hyphenation' scheme. Tones 1, 7, 7: and 8 map to the
-- empty string, i.e. no tone letter is written for them.
local toneConvFromNumbers = {
	['1']  = '',
	['2']  = 'z',
	['3']  = 'j',
	['4']  = 'x',
	['5']  = 'q',
	['6']  = 'h',

	['7']  = '',
	['7:']  = '',
	['8']  = '',
}

-- Consonant spellings that differ in the 'old' scheme of export.convert.
-- It is consulted for both the initial and the final of a syllable; a spelling that is
-- not listed here is kept unchanged.
local consonantConv_old = {
	['mb']  = 'ƃ',
	['nd']  = 'ƌ',
	['ng']  = 'ŋ',
	['ngv'] = 'ŋv',
}

-- Vowel letters that differ in the 'old' scheme of export.convert.
-- It is applied as a gsub replacement table, first over '[oa]e' and then over 'w'.
local vowelConv_old = {
	['oe'] = 'ɵ',
	['ae'] = 'ə',
	['w']  = 'ɯ',
}

-- Tone number (as a string) -> tone letter appended in the 'old' scheme of export.convert.
-- Tones 1, 7, 7: and 8 map to the empty string, i.e. no tone letter is written for them.
local toneConv_old = {
	['1']  = '',
	['2'] = 'ƨ',
	['3'] = 'з',
	['4'] = 'ч',
	['5'] = 'ƽ',
	['6'] = 'ƅ',

	['7']  = '',
	['7:']  = '',
	['8']  = '',
}

-- Adapted from [[:w:ca:Mòdul:Ru-trans]].
-- Character-wise equivalent of string.reverse for UTF-8 text; mw.ustring has no reverse().
-- Returns s with its characters in the opposite order.
local function reverse(s)
	local count = mw.ustring.len(s)
	local chars = {}
	for position = 1, count do
		-- the character at `position` ends up mirrored around the middle of the string
		chars[count - position + 1] = mw.ustring.sub(s, position, position)
	end
	return table.concat(chars)
end

-- Rewrites Zhuang text so that every syllable carries an explicit tone digit.
--
-- For each run of Latin letters the function
--   1. replaces the tone letters z/j/x/q by the digits 2-5,
--   2. splits the run into syllables, marking boundaries with '|',
--   3. repairs boundaries that produced an invalid initial or vowel+final combination,
--   4. turns a syllable-final h into tone 6 and appends 7, 8 or 1 to syllables that
--      still have no tone digit,
--   5. removes the '|' markers again.
--
-- text: string to process.
-- Returns the concatenation of the processed chunks. Anything in front of the first chunk
-- matched by the gmatch pattern is not part of the result.
-- Intermediate states are written to the debug log with mw.log.
local function fix(text)
	local output = {}

	-- A chunk is: optional apostrophe, a run of Latin letters, then the non-letters that follow it.
	for word in gmatch(text, apost .. '?[A-Za-z]+[^A-Za-z]*') do
		-- NB: the inner `word` shadows the loop variable and holds only the Latin-letter part.
		local apostrophe, word, nonword = match(word, '(' .. apost .. '?)([A-Za-z]+)([^A-Za-z]*)') -- Operate only on strings of Latin letters

		word = gsub(word, '[zjxq]', toneConvToNumbers) -- excludes h which is ambiguously tone or consonant

		-- if CVCV... sequence, it is always CV+CV; CVC+V is expressed via apostrophe
		-- pattern matching is greedy from the beginning of the string,
		-- so counteract this by reversing the string:
		-- looking for a CVC sequence then first matches what was originally the last one.
		-- The three captures below are therefore the reversed final, vowel and initial;
		-- after the second reverse() the word starts with '|' and has a '|' after each syllable.
		word = reverse(word)
		word = '|' .. gsub(word, '(g?[mnpbtdkg]?)([ieu]?[uw]?[aeiouwAEIUOUW]+)([vy]?[gbd]?[bmfvdnslghrcyBMFVDNSLGHRCY]?)', '%1%2%3|') -- "+" seems to be needed after "[aiueow]" (combat against greedy question marks, I guess), for example on "daeuz" ("da|euz" otherwise)
		word = reverse(word)
		mw.log('za1>' .. word .. '<preliminary processing>')

		-- Repair 1: a 2-3 character cluster after '|' that is not a key of initialConv gives its
		-- first character back to the preceding syllable. The callback returns nothing (no
		-- replacement) when the cluster is a valid initial.
		word = gsub(word, '(|)([^aiueow])([^aiueow])([^aiueow]?)([aiueow])', function(x,a,b,c,d) if not initialConv[lower(a..b..c)] then return a..x..b..c..d end end) -- fix bad initial consonant: "|hya"→"h|ya", "|ngya"→"n|gya"
		-- Repair 2: a vowel sequence followed by a final must have the shape [aeiouw]e?;
		-- otherwise the syllable is split again. (b is never empty here, because the pattern
		-- requires at least one final letter.)
		word = gsub(word, '([aiueow]+)([mnpbtdkg]g?)(|)', function(a,b,x)
			if b ~= '' then -- if final,
				if not match(a, '^[aeiouw]e?$') then -- and vowel sequence is not a sequence that only appears before finals
					mw.log('\t' .. a .. '+' .. b .. '\t<bad')
					return reverse(gsub(reverse(a..b..x), '(|)([^aiueow]+)(e?[aeiouw])', '%1%2%3|')) -- detect valid ...VC sequence at end of string and partition it away
				end
			end
		end)
		word = gsub(word, '|gvu', 'g|vu') -- lwg|vuengz, /kʷu/,/ŋʷu/ does not seem to exist
		mw.log('za2>' .. word .. '<repair>')

		-- An h directly in front of a boundary is the tone letter of tone 6.
		word = gsub(word, 'h|', '6|')
		-- Syllables that still end in a letter have no tone mark yet: final p/t/k gives
		-- tone 7, final b/d/g (but not ng) gives tone 8, anything else gives tone 1.
		word = gsub(word, '([A-Za-z]+)|', function(a)
			if match(a, '[ptk]$') then
				return a..'7|'
			elseif match(a, '[bdg]$') and not match(a, 'ng$') then
				return a..'8|'
			else
				return a..'1|'
			end
		end)
		mw.log('za3>' .. word .. '<clarified tones>')

		-- Drop the boundary markers and put the chunk back together.
		table.insert(output, apostrophe .. gsub(word, '|', '') .. nonword)
	end

	return table.concat(output)
end

-- Thai vowel signs that are written in front of the initial consonant.
local PREPOSED_THAI_VOWEL = '[เโใไ]'

-- Output schemes understood by export.convert.
local KNOWN_SCHEMES = {
	['IPA'] = true,
	['old'] = true,
	['hyphenation'] = true,
	['tone_numbers'] = true,
	['thai'] = true,
}

-- Looks up one syllable part in a conversion table and raises a readable error
-- (instead of a nil-concatenation error further down) when the spelling is unknown.
local function lookup_part(conv, key, part, syllable)
	local value = conv[key]
	if value == nil then
		error('Unrecognised ' .. part .. ' "' .. key .. '" in Zhuang syllable "' .. syllable .. '"')
	end
	return value
end

-- Picks the rendering of a vowel from a table of { alone = ..., wfinal = ... } entries.
-- A false `alone` form deliberately falls through to `wfinal`; a false `wfinal` form
-- raises a readable error.
local function lookup_vowel(conv, vowel, final, syllable)
	local entry = lookup_part(conv, vowel, 'vowel', syllable)
	local value = final == '' and entry.alone or entry.wfinal
	if not value then
		error('Vowel "' .. vowel .. '" cannot be combined with final "' .. final .. '" in Zhuang syllable "' .. syllable .. '"')
	end
	return value
end

-- Converts Zhuang text to one of several representations.
--
-- text:    the Zhuang spelling, or a frame object; for a frame, the text, scheme and
--          new_bor values are read from frame.args[1], frame.args[2] and frame.args['new_bor'].
-- scheme:  'IPA', 'old', 'hyphenation', 'tone_numbers' or 'thai'.
-- new_bor: if truthy, tone 1 is rendered as tone 5 in the 'IPA', 'tone_numbers' and
--          'thai' schemes.
--
-- Returns a string:
--   'IPA'          syllables joined by spaces and wrapped in slashes
--   'old'          old-orthography spelling
--   'hyphenation'  current spelling with '‧' between syllables and spaces removed
--   'tone_numbers' current spelling with the tone number in <sup> after each syllable
--   'thai'         Thai respelling with the tone name in <sup> after each syllable
-- For every scheme except 'IPA' the non-letter text in front of the first letter is
-- kept as a prefix.
--
-- Raises an error for a missing text, an unknown scheme, or a syllable that cannot
-- be parsed or looked up.
function export.convert(text, scheme, new_bor)
	if type(text) == "table" then
		text, scheme, new_bor = text.args[1], text.args[2], text.args['new_bor']
	end
	if text == nil then
		error('No text to convert')
	end
	-- Checked up front so that an unknown scheme always raises, even when the text
	-- contains no syllable at all.
	if not KNOWN_SCHEMES[scheme] then
		error('Convert to what representation?')
	end

	local converted = {}

	-- fix() drops whatever precedes the first Latin letter, so remember it here.
	local extra_pre = match(text, '^[^A-Za-z]*')

	text = fix(text)

	mw.log('za4>' .. text .. '<processed form>')

	for syllable in gmatch(text, '[A-Za-z]+%d[^A-Za-z]*') do
		local initial, vowel, final, tone, extra = match(syllable, '^([BMFVDNSLGHRCYbmfvdnslghrcy]?[gbd]?[vy]?)([AEIOUWaeiouw][ieu]?[uw]?)([mnpbtdkg]?g?)(%d)([^A-Za-z]*)$')
		if not initial then
			error('Cannot parse Zhuang syllable "' .. syllable .. '"')
		end

		local caps = false
		mw.log('za5>' .. initial, vowel, final, tone, extra)

		if find(initial .. vowel .. final, '[A-Z]') then
			caps = true
			initial, vowel, final = lower(initial), lower(vowel), lower(final)
		end

		local result
		if scheme == 'IPA' then
			local ipa_initial = lookup_part(initialConv, initial, 'initial', syllable)
			local ipa_vowel = lookup_vowel(vowelConv, vowel, final, syllable)
			local ipa_final = lookup_part(finalConv, final, 'final', syllable)
			if tone == '7' and find(ipa_vowel, 'ː') then
				tone = '7:'
			elseif new_bor and tone == '1' then
				tone = '5'
			end

			result = ipa_initial .. ipa_vowel .. ipa_final .. toneConv[tone]
		elseif scheme == 'old' then
			local old_vowel = gsub(vowel, '[oa]e', vowelConv_old)
			old_vowel = gsub(old_vowel, 'w', vowelConv_old)

			result = (consonantConv_old[initial] or initial) .. old_vowel ..
				(consonantConv_old[final] or final) .. toneConv_old[tone] .. extra
			if caps then result = gsub(result, '^(.)', upper) end
		elseif scheme == 'hyphenation' then
			-- working backwards: tone digit -> tone letter
			result = initial .. vowel .. final .. toneConvFromNumbers[tone] .. gsub(extra, '\'', '')
			if caps then result = gsub(result, '^(.)', upper) end
		elseif scheme == 'tone_numbers' then
			if new_bor and tone == '1' then
				tone = '5'
			end

			result = initial .. vowel .. final .. '<sup>' .. tone .. '</sup>' .. gsub(extra, '\'', '')
			if caps then result = gsub(result, '^(.)', upper) end
		else -- scheme == 'thai'
			local thai_initial = lookup_part(ThaiConv, initial, 'initial', syllable)
			local thai_vowel = lookup_vowel(vowelThaiConv, vowel, final, syllable)
			local thai_final = lookup_part(finalThaiConv, final, 'final', syllable)
			-- Vowel length has to be read off the IPA form: the Thai string never
			-- contains the length mark.
			if tone == '7' and find(lookup_vowel(vowelConv, vowel, final, syllable), 'ː') then
				tone = '7:'
			elseif new_bor and tone == '1' then
				tone = '5'
			end

			-- A preposed vowel sign goes in front of the whole initial, including
			-- multi-character initials such as 'ปฺย'.
			local preposed, rest = match(thai_vowel, '^(' .. PREPOSED_THAI_VOWEL .. ')(.*)$')
			if preposed then
				result = preposed .. thai_initial .. rest .. thai_final
			else
				result = thai_initial .. thai_vowel .. thai_final
			end
			-- The tone labels in toneThaiConv rely on this swap to put their own
			-- preposed vowel in front of the consonant.
			local label = gsub(toneThaiConv[tone], '(.)(' .. PREPOSED_THAI_VOWEL .. ')', '%2%1')
			result = result .. label
		end

		table.insert(converted, result)
	end

	if scheme == 'IPA' then
		return '/' .. table.concat(converted, ' ') .. '/'
	elseif scheme == 'hyphenation' then
		-- parentheses drop gsub's second return value (the replacement count)
		return (gsub(extra_pre .. table.concat(converted, '‧'), ' ', ''))
	end
	-- 'old', 'tone_numbers' and 'thai'
	return extra_pre .. table.concat(converted, '')
end

-- Template entry point: renders the pronunciation list for a Zhuang term.
--
-- Parameters are read from the parent frame:
--   1        the spelling to convert; the current page title is used when it is absent
--   dia      dialect name used in the qualifier in front of the IPA
--   new_bor  boolean flag passed on to export.convert
--
-- Returns two values, '* ' and the four list lines (IPA, tone numbers, hyphenation,
-- Thai respelling) joined by '\n* '.
function export.show(frame)
	local args = require("Module:parameters").process(frame:getParent().args, {
		[1] = { },
		['dia'] = { },
		['new_bor'] = { type = "boolean" },
	})

	local text = args[1] or mw.title.getCurrentTitle().text
	local dialect = args['dia']
	local new_bor = args['new_bor']

	local qualifier = require('Module:accent qualifier').format_qualifiers({
		dialect and ('จ้วงแบบ' .. dialect) or 'จ้วงมาตรฐาน'
	})
	local ipa = require('Module:IPA').format_IPA_full(
		require('Module:languages').getByCode('za'),
		{ { pron = export.convert(text, 'IPA', new_bor) } }
	)

	local lines = {
		qualifier .. ' ' .. ipa,
		'เลขวรรณยุกต์: ' .. export.convert(text, 'tone_numbers', new_bor),
		'การแบ่งพยางค์: ' .. export.convert(text, 'hyphenation', new_bor),
		'คำอ่านภาษาไทย (ประมาณ): ' .. export.convert(text, 'thai', new_bor),
	}

	return '* ', table.concat(lines, '\n* ')
end

-- Template helper: tells whether the first argument is Zhuang in the current Latin spelling.
--
-- frame: frame object; the text is read from frame.args[1].
-- Returns 'y' when the text contains a basic Latin letter and none of the special letters
-- of the old orthography, and '' otherwise (including when no text is given).
function export.is_latin(frame)
	-- `local` keeps the value out of the global table; a missing argument counts as empty text
	local text = frame.args[1] or ''

	if find(text, '[ƂƃƋƌŊŋƏəƟɵƜɯƧƨЗзЧчƼƽƄƅ]') then
		return ''
	elseif find(text, '[A-Za-z]') then
		return 'y'
	else
		return '' -- CJK is too much of a pain to detect
	end
end

return export