-- มอดูล:okm-translit (Module:okm-translit)
--
-- จาก วิกิพจนานุกรม พจนานุกรมเสรี (From Wiktionary, the free dictionary)
-- Table of functions exposed by this module; it is returned at the end of the file.
local export = {}
-- Local shorthand for mw.ustring.gsub.
local gsub = mw.ustring.gsub

-- Character sets of the Hani and Hang scripts as provided by Module:scripts.
-- Both strings are spliced into [...] character classes in export.tr.
local chars_Hani = require('Module:scripts').getByCode('Hani'):getCharacters()
local chars_Hang = require('Module:scripts').getByCode('Hang'):getCharacters()

-- Decomposition table used by export.tr as the replacement table of a
-- per-character gsub (run after NFD normalization): every key is a single
-- compound or compatibility jamo, every value is the sequence of simple
-- letters it is rewritten to before the rules in `tt` are applied.
-- In vowel values, '@' is a placeholder that `tt` later romanizes as "y".
-- NOTE(review): the data presumably derives from the two files below -- confirm.
-- https://github.com/szc126/rime-slg-korean/blob/main/slg_break_jamo.yaml
-- https://github.com/szc126/rime-slg-korean/blob/main/soolegi_yethangeul.custom.yaml
local tt_complex = {
-- consonant clusters, spelled with the letters `tt` lists under "# choseong"
['ᄢ']='ᄇᄉᄀ',
['ᄣ']='ᄇᄉᄃ',
['ᄤ']='ᄇᄉᄇ',
['ᄥ']='ᄇᄉᄉ',
['ᄦ']='ᄇᄉᄌ',
['ᄳ']='ᄉᄇᄀ',
['ᄴ']='ᄉᄉᄉ',
['ꥥ']='ᄅᄀᄀ',
['ꥧ']='ᄅᄃᄃ',
['ꥪ']='ᄅᄇᄇ',
['ꥲ']='ᄇᄉᄐ',
['ꥵ']='ᄉᄉᄇ',
['ꥸ']='ᄌᄌᄒ',
['ᄁ']='ᄀᄀ',
['ᄄ']='ᄃᄃ',
['ᄈ']='ᄇᄇ',
['ᄊ']='ᄉᄉ',
['ᄍ']='ᄌᄌ',
['ᄓ']='ᄂᄀ',
['ᄔ']='ᄂᄂ',
['ᄕ']='ᄂᄃ',
['ᄖ']='ᄂᄇ',
['ᄗ']='ᄃᄀ',
['ᄘ']='ᄅᄂ',
['ᄙ']='ᄅᄅ',
['ᄚ']='ᄅᄒ',
['ᄜ']='ᄆᄇ',
['ᄞ']='ᄇᄀ',
['ᄟ']='ᄇᄂ',
['ᄠ']='ᄇᄃ',
['ᄡ']='ᄇᄉ',
['ᄧ']='ᄇᄌ',
['ᄨ']='ᄇᄎ',
['ᄩ']='ᄇᄐ',
['ᄪ']='ᄇᄑ',
['ᄬ']='ᄫᄫ',
['ᄭ']='ᄉᄀ',
['ᄮ']='ᄉᄂ',
['ᄯ']='ᄉᄃ',
['ᄰ']='ᄉᄅ',
['ᄱ']='ᄉᄆ',
['ᄲ']='ᄉᄇ',
['ᄵ']='ᄉᄋ',
['ᄶ']='ᄉᄌ',
['ᄷ']='ᄉᄎ',
['ᄸ']='ᄉᄏ',
['ᄹ']='ᄉᄐ',
['ᄺ']='ᄉᄑ',
['ᄻ']='ᄉᄒ',
['ᄽ']='ᄼᄼ',
['ᄿ']='ᄾᄾ',
['ᅁ']='ᄋᄀ',
['ᅂ']='ᄋᄃ',
['ᅃ']='ᄋᄆ',
['ᅄ']='ᄋᄇ',
['ᅅ']='ᄋᄉ',
['ᅆ']='ᄋᅀ',
['ᅇ']='ᄋᄋ',
['ᅈ']='ᄋᄌ',
['ᅉ']='ᄋᄎ',
['ᅊ']='ᄋᄐ',
['ᅋ']='ᄋᄑ',
['ᅍ']='ᄌᄋ',
['ᅏ']='ᅎᅎ',
['ᅑ']='ᅐᅐ',
['ᅒ']='ᄎᄏ',
['ᅓ']='ᄎᄒ',
['ᅖ']='ᄑᄇ',
['ᅘ']='ᄒᄒ',
['ᅚ']='ᄀᄃ',
['ᅛ']='ᄂᄉ',
['ᅜ']='ᄂᄌ',
['ᅝ']='ᄂᄒ',
['ᅞ']='ᄃᄅ',
['ꥠ']='ᄃᄆ',
['ꥡ']='ᄃᄇ',
['ꥢ']='ᄃᄉ',
['ꥣ']='ᄃᄌ',
['ꥤ']='ᄅᄀ',
['ꥦ']='ᄅᄃ',
['ꥨ']='ᄅᄆ',
['ꥩ']='ᄅᄇ',
['ꥫ']='ᄅᄫ',
['ꥬ']='ᄅᄉ',
['ꥭ']='ᄅᄌ',
['ꥮ']='ᄅᄏ',
['ꥯ']='ᄆᄀ',
['ꥰ']='ᄆᄃ',
['ꥱ']='ᄆᄉ',
['ꥳ']='ᄇᄏ',
['ꥴ']='ᄇᄒ',
['ꥶ']='ᄋᄅ',
['ꥷ']='ᄋᄒ',
['ꥹ']='ᄐᄐ',
['ꥺ']='ᄑᄒ',
['ꥻ']='ᄒᄉ',
['ꥼ']='ᅙᅙ',

-- compound vowels, spelled with '@' and the letters `tt` lists under "# jungseong"
['ᆅ']='@ᅩ@ᅡ@',
['ᆒ']='@ᅮ@ᅥ@',
['ᅹ']='@ᅡ@ᅩ',
['ᆄ']='@ᅩ@ᅡ',
['ᆆ']='@ᅩ@ᅥ',
['ᆑ']='@ᅮ@ᅥ',
['ᆥ']='@ᅥ@ᅡ',
['ᆐ']='@ᅮᅥ@',
['ힳ']='@ᅩᅡ@',
['ힷ']='@ᅮᅡ@',
['ᆁ']='ᅩ@ᅥ@',
['ᆌ']='ᅮ@ᅥ@',
['ᆧ']='ᅩ@ᅡ@',
['ힽ']='ᅵ@ᅡᅩ',
['ힾ']='ᅵ@ᅡ@',
['ퟀ']='ᅵ@ᅥ@',
['ᅤ']='@ᅡ@',
['ᅨ']='@ᅥ@',
['ᅸ']='@ᅡᅩ',
['ᅽ']='@ᅥᅩ',
['ᅾ']='@ᅥᅮ',
['ᆇ']='@ᅩᅩ',
['ᆈ']='@ᅩ@',
['ᆎ']='@ᅮᅡ',
['ᆏ']='@ᅮᅥ',
['ᆓ']='@ᅮᅮ',
['ᆔ']='@ᅮ@',
['ᆤ']='@ᅡᅮ',
['ힲ']='@ᅩᅡ',
['ힴ']='@ᅩᅥ',
['ힸ']='@ᅮᅩ',
['ᆙ']='ᅵ@ᅡ',
['ᆦ']='ᅩ@ᅡ',
['ힰ']='ᅩ@ᅥ',
['ힵ']='ᅮ@ᅥ',
['ힿ']='ᅵ@ᅥ',
['ퟂ']='ᅵ@ᅩ',
['ퟃ']='ᅵ@ᅮ',
['ᅫ']='ᅩᅡ@',
['ᅰ']='ᅮᅥ@',
['ᆀ']='ᅩᅥ@',
['ᆊ']='ᅮᅡ@',
['ᆋ']='ᅮᅥᅳ',
['ᆗ']='ᅳᅵᅮ',
['ힱ']='ᅩᅩᅵ',
['ힶ']='ᅮᅵ@',
['ힻ']='ᅳᅥ@',
['ퟁ']='ᅵᅩᅵ',
['ퟆ']='ᆞᅥ@',
['ᅣ']='@ᅡ',
['ᅧ']='@ᅥ',
['ᅭ']='@ᅩ',
['ᅲ']='@ᅮ',
['ᅢ']='ᅡ@',
['ᅦ']='ᅥ@',
['ᅪ']='ᅩᅡ',
['ᅬ']='ᅩ@',
['ᅯ']='ᅮᅥ',
['ᅱ']='ᅮ@',
['ᅴ']='ᅳ@',
['ᅶ']='ᅡᅩ',
['ᅷ']='ᅡᅮ',
['ᅺ']='ᅥᅩ',
['ᅻ']='ᅥᅮ',
['ᅼ']='ᅥᅳ',
['ᅿ']='ᅩᅥ',
['ᆂ']='ᅩᅩ',
['ᆃ']='ᅩᅮ',
['ᆉ']='ᅮᅡ',
['ᆍ']='ᅮᅮ',
['ᆕ']='ᅳᅮ',
['ᆖ']='ᅳᅳ',
['ᆘ']='ᅵᅡ',
['ᆚ']='ᅵᅩ',
['ᆛ']='ᅵᅮ',
['ᆜ']='ᅵᅳ',
['ᆝ']='ᅵᆞ',
['ᆟ']='ᆞᅥ',
['ᆠ']='ᆞᅮ',
['ᆡ']='ᆞ@',
['ᆢ']='ᆞᆞ',
['ᆣ']='ᅡᅳ',
['ힹ']='ᅳᅡ',
['ힺ']='ᅳᅥ',
['ힼ']='ᅳᅩ',
['ퟄ']='ᅵ@',
['ퟅ']='ᆞᅡ',

-- consonant clusters, spelled with the letters `tt` lists under "# jongseong"
['ᇄ']='ᆨᆺᆨ',
['ᇌ']='ᆯᆨᆺ',
['ᇏ']='ᆯᆮᇂ',
['ᇑ']='ᆯᆷᆨ',
['ᇒ']='ᆯᆷᆺ',
['ᇓ']='ᆯᆸᆺ',
['ᇔ']='ᆯᆸᇂ',
['ᇖ']='ᆯᆺᆺ',
['ᇞ']='ᆷᆺᆺ',
['ᇭ']='ᇰᆨᆨ',
['ퟎ']='ᆮᆮᆸ',
['ퟑ']='ᆮᆺᆨ',
['ퟕ']='ᆯᆨᆨ',
['ퟖ']='ᆯᆨᇂ',
['ퟗ']='ᆯᆯᆿ',
['ퟘ']='ᆯᆷᇂ',
['ퟙ']='ᆯᆸᆮ',
['ퟚ']='ᆯᆸᇁ',
['ퟜ']='ᆯᇹᇂ',
['ퟟ']='ᆷᆫᆫ',
['ퟡ']='ᆷᆸᆺ',
['ퟤ']='ᆸᆯᇁ',
['ퟧ']='ᆸᆺᆮ',
['ퟬ']='ᆺᆺᆨ',
['ퟭ']='ᆺᆺᆮ',
['ퟸ']='ᆽᆸᆸ',
['ᆩ']='ᆨᆨ',
['ᆪ']='ᆨᆺ',
['ᆬ']='ᆫᆽ',
['ᆭ']='ᆫᇂ',
['ᆰ']='ᆯᆨ',
['ᆱ']='ᆯᆷ',
['ᆲ']='ᆯᆸ',
['ᆳ']='ᆯᆺ',
['ᆴ']='ᆯᇀ',
['ᆵ']='ᆯᇁ',
['ᆶ']='ᆯᇂ',
['ᆹ']='ᆸᆺ',
['ᆻ']='ᆺᆺ',
['ᇃ']='ᆨᆯ',
['ᇅ']='ᆫᆨ',
['ᇆ']='ᆫᆮ',
['ᇇ']='ᆫᆺ',
['ᇈ']='ᆫᇫ',
['ᇉ']='ᆫᇀ',
['ᇊ']='ᆮᆨ',
['ᇋ']='ᆮᆯ',
['ᇍ']='ᆯᆫ',
['ᇎ']='ᆯᆮ',
['ᇐ']='ᆯᆯ',
['ᇕ']='ᆯᇦ',
['ᇗ']='ᆯᇫ',
['ᇘ']='ᆯᆿ',
['ᇙ']='ᆯᇹ',
['ᇚ']='ᆷᆨ',
['ᇛ']='ᆷᆯ',
['ᇜ']='ᆷᆸ',
['ᇝ']='ᆷᆺ',
['ᇟ']='ᆷᇫ',
['ᇠ']='ᆷᆾ',
['ᇡ']='ᆷᇂ',
['ᇣ']='ᆸᆯ',
['ᇤ']='ᆸᇁ',
['ᇥ']='ᆸᇂ',
['ᇧ']='ᆺᆨ',
['ᇨ']='ᆺᆮ',
['ᇩ']='ᆺᆯ',
['ᇪ']='ᆺᆸ',
['ᇬ']='ᇰᆨ',
['ᇮ']='ᇰᇰ',
['ᇯ']='ᇰᆿ',
['ᇱ']='ᇰᆺ',
['ᇲ']='ᇰᇫ',
['ᇳ']='ᇁᆸ',
['ᇵ']='ᇂᆫ',
['ᇶ']='ᇂᆯ',
['ᇷ']='ᇂᆷ',
['ᇸ']='ᇂᆸ',
['ᇺ']='ᆨᆫ',
['ᇻ']='ᆨᆸ',
['ᇼ']='ᆨᆾ',
['ᇽ']='ᆨᆿ',
['ᇾ']='ᆨᇂ',
['ᇿ']='ᆫᆫ',
['ퟋ']='ᆫᆯ',
['ퟌ']='ᆫᆾ',
['ퟍ']='ᆮᆮ',
['ퟏ']='ᆮᆸ',
['ퟐ']='ᆮᆺ',
['ퟒ']='ᆮᆽ',
['ퟓ']='ᆮᆾ',
['ퟔ']='ᆮᇀ',
['ퟛ']='ᆯᇰ',
['ퟞ']='ᆷᆫ',
['ퟠ']='ᆷᆷ',
['ퟢ']='ᆷᆽ',
['ퟣ']='ᆸᆮ',
['ퟥ']='ᆸᆷ',
['ퟦ']='ᆸᆸ',
['ퟨ']='ᆸᆽ',
['ퟩ']='ᆸᆾ',
['ퟪ']='ᆺᆷ',
['ퟫ']='ᆺᇦ',
['ퟮ']='ᆺᇫ',
['ퟯ']='ᆺᆽ',
['ퟰ']='ᆺᆾ',
['ퟱ']='ᆺᇀ',
['ퟲ']='ᆺᇂ',
['ퟳ']='ᇫᆸ',
['ퟴ']='ᇫᇦ',
['ퟵ']='ᇰᆷ',
['ퟶ']='ᇰᇂ',
['ퟷ']='ᆽᆸ',
['ퟹ']='ᆽᆽ',
['ퟺ']='ᇁᆺ',
['ퟻ']='ᇁᇀ',

-- compatibility jamo
-- consonants are rewritten to the letters `tt` lists under "# choseong"
['ㅩ']='ᄅᄀᄉ',
['ㅫ']='ᄅᄇᄉ',
['ㅴ']='ᄇᄉᄀ',
['ㅵ']='ᄇᄉᄃ',
['ㄲ']='ᄀᄀ',
['ㄸ']='ᄃᄃ',
['ㅃ']='ᄇᄇ',
['ㄳ']='ᄀᄉ',
['ㄵ']='ᄂᄌ',
['ㄶ']='ᄂᄒ',
['ㄺ']='ᄅᄀ',
['ㄻ']='ᄅᄆ',
['ㄼ']='ᄅᄇ',
['ㄽ']='ᄅᄉ',
['ㄾ']='ᄅᄐ',
['ㄿ']='ᄅᄑ',
['ㅀ']='ᄅᄒ',
['ㅄ']='ᄇᄉ',
['ㅆ']='ᄉᄉ',
['ㅉ']='ᄌᄌ',
['ㅥ']='ᄂᄂ',
['ㅦ']='ᄂᄃ',
['ㅧ']='ᄂᄉ',
['ㅨ']='ᄂᅀ',
['ㅪ']='ᄅᄃ',
['ㅬ']='ᄅᅀ',
['ㅭ']='ᄅᅙ',
['ㅮ']='ᄆᄇ',
['ㅯ']='ᄆᄉ',
['ㅰ']='ᄆᅀ',
['ㅲ']='ᄇᄀ',
['ㅳ']='ᄇᄃ',
['ㅶ']='ᄇᄌ',
['ㅷ']='ᄇᄐ',
['ㅹ']='ᄫᄫ',
['ㅺ']='ᄉᄀ',
['ㅻ']='ᄉᄂ',
['ㅼ']='ᄉᄃ',
['ㅽ']='ᄉᄇ',
['ㅾ']='ᄉᄌ',
['ㆀ']='ᄋᄋ',
['ㆂ']='ᅌᄉ',
['ㆃ']='ᅌᅀ',
['ㆅ']='ᄒᄒ',
['ㄱ']='ᄀ',
['ㄴ']='ᄂ',
['ㄷ']='ᄃ',
['ㄹ']='ᄅ',
['ㅁ']='ᄆ',
['ㅂ']='ᄇ',
['ㅅ']='ᄉ',
['ㅇ']='ᄋ',
['ㅈ']='ᄌ',
['ㅊ']='ᄎ',
['ㅋ']='ᄏ',
['ㅌ']='ᄐ',
['ㅍ']='ᄑ',
['ㅎ']='ᄒ',
['ㅤ']='ᅟ', -- filler
['ㅱ']='ᄝ',
['ㅸ']='ᄫ',
['ㅿ']='ᅀ',
['ㆁ']='ᅌ',
['ㆄ']='ᅗ',
['ㆆ']='ᅙ',

-- compatibility vowels; note these spell an i-offglide with 'ᅵ', not '@'
['ㆈ']='@ᅩ@ᅡᅵ',
['ㆋ']='@ᅮ@ᅥᅵ',
['ㆇ']='@ᅩ@ᅡ',
['ㆊ']='@ᅮ@ᅥ',
['ㅒ']='@ᅡᅵ',
['ㅖ']='@ᅥᅵ',
['ㅙ']='ᅩᅡᅵ',
['ㅞ']='ᅮᅥᅵ',
['ㆉ']='@ᅩᅵ',
['ㆌ']='@ᅮᅵ',
['ㅐ']='ᅡᅵ',
['ㅑ']='@ᅡ',
['ㅔ']='ᅥᅵ',
['ㅕ']='@ᅥ',
['ㅘ']='ᅩᅡ',
['ㅚ']='ᅩᅵ',
['ㅛ']='@ᅩ',
['ㅝ']='ᅮᅥ',
['ㅟ']='ᅮᅵ',
['ㅠ']='@ᅮ',
['ㅢ']='ᅳᅵ',
['ㅏ']='ᅡ',
['ㅓ']='ᅥ',
['ㅗ']='ᅩ',
['ㅜ']='ᅮ',
['ㅡ']='ᅳ',
['ㅣ']='ᅵ',
['ㆍ']='ᆞ',
}

-- Ordered rewrite rules consumed line by line in export.tr.
--   * Each rule line is "pattern<TAB>replacement"; both fields are handed to
--     mw.ustring.gsub, so they use Lua pattern / replacement syntax.
--   * "#" starts a comment; comments and empty lines are stripped right after
--     this string is defined.
--   * A replacement of "×" stands for the empty string.
--   * The lines "BREAK<TAB>1" and "BREAK<TAB>2" are not rules: they mark the
--     points at which export.tr runs its two built-in processing steps.
--   * "◆" is the output for letters that have no romanization here;
--     export.tr tracks any result that still contains it.
local tt = [==[
BREAK	1

# remove hanja from (ex.) 사뎐(辭典)
# caps prob. isn't necessary since the "base" text is actually hangeul?
# Hani regex is a reasonable subset of Hani from [[Module:scripts/data]],
# last checked on 20220221
%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%)	×

# to yale

# non-simple
gᄋ	Ğ # voiced velar fricative /ɣ/
ᄋᄋ	Ő
@ᅮ	yu
@ᅩ	yo
ᅩᅡ	wa
ᅮᅥ	we
ᅵᆞ	yo
ᆞᆞ	yo

# choseong
ᄀ	K
ᄂ	N
ᄃ	T
ᄅ	L
ᄆ	M
ᄇ	P
ᄉ	S
ᄋ	Ø
ᄌ	C
ᄎ	CH
ᄏ	KH
ᄐ	TH
ᄑ	PH
ᄒ	H
ᄝ	◆
ᄫ	Ƃ
ᅗ	◆
ᄛ	◆
ᅌ	Ŋ
ᅀ	Z
ᅙ	Q
ᄼ	◆
ᅎ	◆
ᅔ	◆
ᄾ	◆
ᅐ	◆
ᅕ	◆
ᅟ	× # filler

# jungseong
@	y
ᅡ	a
ᅥ	e
ᅩ	wo
ᅮ	wu
ᅳ	u
ᅵ	i
ᆞ	o
ᅠ	× # filler

# jongseong
ᆨ	k
ᆫ	n
ᆮ	t
ᆯ	l
ᆷ	m
ᆸ	p
ᆺ	s
ᆼ	ø
ᆽ	c
ᆾ	ch
ᆿ	kh
ᇀ	th
ᇁ	ph
ᇂ	h
ᇢ	◆
ᇦ	ƃ
ᇴ	◆
ퟝ	◆
ᇰ	ŋ
ᇫ	z
ᇹ	q

# tone
〮	↑
〯	→

# tone diacritic location
([aiueo]+)([y]?)([↑→↓])	%1%3%2

# hyphens within syllables
# CV-y
# CVC-C
# CV-C
# C-V
%-%-%-%-(.-[wyaiueo↑→↓]+)(y)	%1-%2
%-%-%-(.-[wyaiueo↑→↓]+[^wyaiueo ])([^wyaiueo ])	%1-%2
%-%-%-(.-[wyaiueo↑→↓]+)	%1-
%-%-(.-)([wyaiueo])	%1-%2

# 子(ᄌᆞ)ㅣ
(%))(%-?)i	%1%2y

Ø	×

BREAK	2

↑	́
→	̌
↓	̀

ğ	G
ő	OO
Ø	NG # capitalized hanja readings
ø	ng
ƃ	W
Ŋ	NG # capitalized hanja readings
ŋ	ng
]==]

-- Normalise the rule table once at load time: trim it, strip "#" comments
-- (together with the whitespace in front of them), then collapse runs of
-- newlines so that no empty lines remain.
tt = gsub(gsub(mw.text.trim(tt), '%s*#[^\n]+', ''), '\n+', '\n')

-- Letter sets spliced into [...] character classes by export.tr.
-- a: the letters `tt` lists under "# choseong"
local a = 'ᄀᄂᄃᄅᄆᄇᄉᄋᄌᄎᄏᄐᄑᄒᄝᄫᅗᄛᅌᅀᅙᄼᅎᅔᄾᅐᅕᅟ'
-- b: the '@' placeholder plus the letters `tt` lists under "# jungseong"
local b = '@ᅡᅥᅩᅮᅳᅵᆞᅠ'
-- c: the letters `tt` lists under "# jongseong"
local c = 'ᆨᆫᆮᆯᆷᆸᆺᆼᆽᆾᆿᇀᇁᇂᇢᇦᇴퟝᇰᇫᇹ'
-- d: the two tone marks `tt` lists under "# tone"
local d = '〮〯'

-- Parsed form of `tt`: a list of { pattern, repl } pairs in file order.
-- Built on first use so that the rule table is split once per module load
-- rather than once per call to export.tr.
local tt_rules

-- Returns the parsed rule list, building it on the first call.
-- Every line of `tt` must have the form "pattern<TAB>replacement"; a line that
-- does not raises an error naming the offending line.
local function get_rules()
	if tt_rules then
		return tt_rules
	end

	local rules = {}
	for line in mw.text.gsplit(tt, '\n') do
		local pattern, repl = mw.ustring.match(line, '(.+)\t(.+)')
		if not pattern then
			error('okm-translit: malformed rule line (expected "pattern<TAB>replacement"): ' .. line)
		end
		rules[#rules + 1] = { pattern, repl }
	end

	tt_rules = rules
	return rules
end

-- Step run at "BREAK 1", i.e. before any rewrite rule is applied.
local function prepare_syllables(text, has_tone_marks)
	-- add period between hanja readings: inside "hanja(reading)", every run of
	-- initial consonants gets a leading "."
	text = gsub(text, '([' .. chars_Hani .. '])%((.-)%)', function(hanja, reading)
		local dotted = gsub(reading, ('([%s]+)'):format(a), '.%1')
		return hanja .. '(' .. dotted .. ')'
	end)

	if has_tone_marks then
		-- move the tone mark from the end of the syllable to just after the
		-- vowel letters for easier handling, and mark syllables that carry no
		-- tone mark as low tone
		local syllable = ('([%s]+)([%s]+)([%s]*)([%s]*)'):format(a, b, c, d)
		text = gsub(text, syllable, function(initial, vowel, final, tone)
			return initial .. vowel .. (tone == '' and '↓' or tone) .. final
		end)
	end

	return text
end

-- Step run at "BREAK 2", i.e. after the main rewrite rules.
local function finish_hanja_readings(text, has_tone_marks)
	text = mw.ustring.lower(text)

	-- hanja readings: replace "hanja(reading)" with the upper-cased reading
	-- ref. [[Module:Ethi-translit]]
	text = gsub(text, '[' .. chars_Hani .. ']+%((.-)%)', function(reading)
		-- treat final ieung as null if tones are marked (is this a safe assumption?)
		if has_tone_marks then
			reading = gsub(reading, 'ø', '')
		end
		return mw.ustring.upper(reading)
	end)

	-- remove hanja reading leading period
	text = gsub(text, '^%.', '')
	text = gsub(text, "'''%.", "'''")
	text = gsub(text, '(%s)%.', '%1')

	return text
end

-- Transliterates `text` into Latin letters by applying the rules in `tt`
-- (whose own comments label the scheme as Yale).
--
-- text: string to transliterate; <ruby>, <rp> and <rt> tags are removed first.
-- lang, sc: accepted but not used by this function.
--
-- Returns the transliterated string, or nil when `text` contains no character
-- of the Hang script. If the result still contains "◆" (a letter without a
-- romanization in `tt`), a tracking entry is recorded via Module:debug.
function export.tr(text, lang, sc)
	text = gsub(text, "%<%/?r[pt]%>", "")
	text = gsub(text, "%<%/?ruby%>", "")

	if not mw.ustring.match(text, '[' .. chars_Hang .. ']') then
		return nil
	end

	-- must be checked on the raw input, before the rules rewrite the tone marks
	local has_tone_marks = mw.ustring.find(text, ('[%s]'):format(d)) ~= nil

	-- decompose syllable blocks into jamo, then compound jamo into simple letters
	text = mw.ustring.toNFD(text)
	text = gsub(text, '.', tt_complex)

	for _, rule in ipairs(get_rules()) do
		local pattern, repl = rule[1], rule[2]

		if pattern == 'BREAK' and repl == '1' then
			text = prepare_syllables(text, has_tone_marks)
		elseif pattern == 'BREAK' and repl == '2' then
			text = finish_hanja_readings(text, has_tone_marks)
		else
			-- "×" in the rule table stands for the empty replacement
			text = gsub(text, pattern, repl == '×' and '' or repl)
		end
	end

	-- track failed romanizations
	-- (black diamond instead of U+FFFD to avoid warnings when saving this page)
	if mw.ustring.match(text, '◆') then
		require('Module:debug').track('okm-translit/failed romanization')
	end

	return text
end

return export