local concat = table.concat
local get_etym_lang = require("Module:etymology languages").getByCanonicalName
local gsub = mw.ustring.gsub
local insert = table.insert
local split = mw.text.split
local trim = mw.text.trim
local u = mw.ustring.char
local function track(track_id)
local tracking_page = "headword/" .. track_id
local m_debug_track = require("Module:debug/track")
m_debug_track(tracking_page)
return true
end
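-- Usage sketch: track("some-id") records the current page under the tracking
-- entry "headword/some-id" via [[Module:debug/track]]; it returns true so the
-- call can be chained inside boolean expressions.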
local frame = mw.getCurrentFrame()
local title = mw.title.getCurrentTitle()
-- getContent() returns nil for nonexistent pages, so fall back to the empty string
-- before stripping HTML comments (complete ones first, then any unterminated one).
local content = (title:getContent() or "")
:gsub("<!%-%-.-%-%->", "")
:gsub("<!%-%-.*", "")
local content_lang = mw.getContentLanguage()
local data = {}
------ 1. Lists that will be converted into sets. ------
data.invariable = {
"ชมาโว", --cmavo (Lojban)
"ชเมเน", --cmene (Lojban)
"ฟูฮิฝลา", --fu'ivla (Lojban)
"กิสมู", --gismu (Lojban)
"ฮั้นถื่อ", --Han tu
"ฮั่นจื้อ", --hanzi
"ฮันจา", --hanja
"ยฺหวืดเพ็ง", --jyutping
"คันจิ", --kanji
"ลุฌโว", --lujvo (Lojban)
"phrasebook",
"พินอิน", --pinyin
"รัฟซี", --rafsi (Lojban)
"โรมาจิ", --romaji
}
data.lemmas = {
"คำย่อ", --abbreviations
"acronyms",
"คำคุณศัพท์", --adjectives
"adnominals",
"adpositions",
"คำกริยาวิเศษณ์", --adverbs
"affixes",
"ambipositions",
"คำกำกับนาม", --articles
"circumfixes",
"circumpositions",
"คำลักษณนาม", --classifiers
"ชมาโว", --cmavo
"cmavo clusters",
"ชเมเน", --cmene
"combining forms",
"คำสันธาน", --conjunctions
"คำลักษณนาม", --counters = classifiers
"คำกำหนด", --determiners
"diacritical marks",
"ทวิอักษร", --digraphs
"equative adjectives",
"ฟูฮิฝลา", --fu'ivla
"กิสมู", --gismu
"อักษรจีน", --Han characters
"ฮั้นถื่อ", --Han tu
"ฮั่นจื้อ", --hanzi
"ฮันจา", --hanja
"ideophones",
"สำนวน", --idioms
"อาคม", --infixes
"อักษรย่อ", --initialisms
"interfixes",
"คำอุทาน", --interjections
"คันจิ", --kanji
"ตัวอักษร", --letters
"ตัวอักษรควบ", --ligatures
"logograms",
"ลุฌโว", --lujvo
"หน่วยคำ", --morphemes
"non-constituents",
"คำนาม", --nouns
"จำนวน", --numbers
"ตัวเลข", --numeral symbols
"เลข", --numerals
"คำอนุภาค", --particles
"วลี", --phrases
"คำปัจฉบท", --postpositions
"postpositional phrases",
"predicatives",
"อุปสรรค", --prefixes
"prepositional phrases",
"คำบุพบท", --prepositions
"preverbs",
"pronominal adverbs",
"คำสรรพนาม", --pronouns
"คำวิสามานยนาม", --proper nouns
"สุภาษิต", --proverbs
"เครื่องหมายวรรคตอน", --punctuation marks
"relatives",
"ราก", --roots
"stems",
"ปัจจัย", --suffixes
"พยางค์", --syllables
"สัญลักษณ์", --symbols
"คำกริยา", --verbs
}
data.nonlemmas = {
"active participle forms",
"active participles",
"adjectival participles",
"adjective case forms",
"รูปผันคำคุณศัพท์", --adjective forms
"adjective feminine forms",
"รูปผันคำคุณศัพท์พหูพจน์", --adjective plural forms
"รูปผันคำกริยาวิเศษณ์", --adverb forms
"adverbial participles",
"agent participles",
"รูปผันคำกำกับนาม", --article forms
"circumfix forms",
"combined forms",
"comparative adjective forms",
"คำคุณศัพท์ขั้นกว่า", --comparative adjectives
"comparative adverb forms",
"คำกริยาวิเศษณ์ขั้นกว่า", --comparative adverbs
"รูปผันคำสันธาน", --conjunction forms
"contractions",
"converbs",
"รูปผันคำกำหนดขั้นกว่า", --determiner comparative forms
"รูปผันคำกำหนด", --determiner forms
"รูปผันคำกำหนดขั้นสุด", --determiner superlative forms
"คำนามบอกความเล็ก", --diminutive nouns
"equative adjective forms",
"equative adjectives",
"future participles",
"gerunds",
"infinitive forms",
"infinitives",
"รูปผันคำอุทาน", --interjection forms
"ยฺหวืดเพ็ง", --jyutping
"kanji readings",
"misspellings",
"negative participles",
"nominal participles",
"noun case forms",
"รูปผันคำนามทวิพจน์", --noun dual forms
"รูปผันคำนาม", --noun forms
"noun paucal forms",
"รูปผันคำนามพหูพจน์", --noun plural forms
"noun possessive forms",
"noun singulative forms",
"รูปผันเลข", --numeral forms
"participles",
"participle forms",
"รูปผันคำอนุภาค", --particle forms
"passive participles",
"past active participles",
"รูปผันอดีตกาลสมบูรณ์", --past participles
--"รูปผันอดีตกาลสมบูรณ์", --past participle forms
"past passive participles",
"perfect active participles",
"perfect participles",
"perfect passive participles",
"พินอิน", --pinyin
"พหูพจน์", --plurals
"รูปผันคำปัจฉบท", --postposition forms
"รูปผันอุปสรรค", --prefix forms
"preposition contractions",
"รูปผันคำบุพบท", --preposition forms
"prepositional pronouns",
"present active participles",
"รูปผันปัจจุบันกาลสมบูรณ์", --present participles
"present passive participles",
"รูปผันคำสรรพนาม", --pronoun forms
"pronoun possessive forms",
"รูปผันคำวิสามานยนาม", --proper noun forms
"รูปผันคำวิสามานยนามพหูพจน์", --proper noun plural forms
"รัฟซี", --rafsi
"การถอดเป็นอักษรโรมัน", --romanizations
"root forms",
"singulatives",
"รูปผันปัจจัย", --suffix forms
"superlative adjective forms",
"คำคุณศัพท์ขั้นสุด", --superlative adjectives
"superlative adverb forms",
"คำกริยาวิเศษณ์ขั้นสุด", --superlative adverbs
"รูปผันคำกริยา", --verb forms
"verbal nouns",
}
-- These languages will not have links to separate parts of the headword.
data.no_multiword_links = {
"zh",
}
-- These languages will not have "LANG multiword terms" categories added.
data.no_multiword_cat = {
-------- Languages without spaces between words (sometimes spaces between phrases) --------
"blt", -- Tai Dam
"ja", -- Japanese
"khb", -- Lü
"km", -- Khmer
"lo", -- Lao
"mnw", -- Mon
"my", -- Burmese
"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
"nod", -- Northern Thai
"ojp", -- Old Japanese
"shn", -- Shan
"sou", -- Southern Thai
"tdd", -- Tai Nüa
"th", -- Thai
"tts", -- Isan
"twh", -- Tai Dón
"txg", -- Tangut
"zh", -- Chinese (all varieties with Chinese characters)
"zkt", -- Khitan
-------- Languages with spaces between syllables --------
"ahk", -- Akha
"aou", -- A'ou
"atb", -- Zaiwa
"byk", -- Biao
"cdy", -- Chadong
--"duu", -- Drung; not sure
--"hmx-pro", -- Proto-Hmong-Mien
--"hnj", -- Green Hmong; not sure
"huq", -- Tsat
"ium", -- Iu Mien
--"lis", -- Lisu; not sure
"mtq", -- Muong
--"mww", -- White Hmong; not sure
"onb", -- Lingao
--"sit-gkh", -- Gokhy; not sure
--"swi", -- Sui; not sure
"tbq-lol-pro", -- Proto-Loloish
"tdh", -- Thulung
"ukk", -- Muak Sa-aak
"vi", -- Vietnamese
"yig", -- Wusa Nasu
"zng", -- Mang
-------- Languages with ~ with surrounding spaces used to separate variants --------
"mkh-ban-pro", -- Proto-Bahnaric
"sit-pro", -- Proto-Sino-Tibetan; listed above
-------- Other weirdnesses --------
"mul", -- Translingual; gestures, Morse code, etc.
"aot", -- Atong (India); bullet is a letter
-------- All sign languages --------
"ads",
"aed",
"aen",
"afg",
"ase",
"asf",
"asp",
"asq",
"asw",
"bfi",
"bfk",
"bog",
"bqn",
"bqy",
"bvl",
"bzs",
"cds",
"csc",
"csd",
"cse",
"csf",
"csg",
"csl",
"csn",
"csq",
"csr",
"doq",
"dse",
"dsl",
"ecs",
"esl",
"esn",
"eso",
"eth",
"fcs",
"fse",
"fsl",
"fss",
"gds",
"gse",
"gsg",
"gsm",
"gss",
"gus",
"hab",
"haf",
"hds",
"hks",
"hos",
"hps",
"hsh",
"hsl",
"icl",
"iks",
"ils",
"inl",
"ins",
"ise",
"isg",
"isr",
"jcs",
"jhs",
"jls",
"jos",
"jsl",
"jus",
"kgi",
"kvk",
"lbs",
"lls",
"lsl",
"lso",
"lsp",
"lst",
"lsy",
"lws",
"mdl",
"mfs",
"mre",
"msd",
"msr",
"mzc",
"mzg",
"mzy",
"nbs",
"ncs",
"nsi",
"nsl",
"nsp",
"nsr",
"nzs",
"okl",
"pgz",
"pks",
"prl",
"prz",
"psc",
"psd",
"psg",
"psl",
"pso",
"psp",
"psr",
"pys",
"rms",
"rsl",
"rsm",
"sdl",
"sfb",
"sfs",
"sgg",
"sgx",
"slf",
"sls",
"sqk",
"sqs",
"ssp",
"ssr",
"svk",
"swl",
"syy",
"tse",
"tsm",
"tsq",
"tss",
"tsy",
"tza",
"ugn",
"ugy",
"ukl",
"uks",
"vgt",
"vsi",
"vsl",
"vsv",
"xki",
"xml",
"xms",
"ygs",
"ysl",
"zib",
"zsl",
}
-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
data.hyphen_not_multiword_sep = {
"akk", -- Akkadian; hyphens between syllables
"akl", -- Aklanon; hyphens for mid-word glottal stops
"ber-pro", -- Proto-Berber; morphemes separated by hyphens
"ceb", -- Cebuano; hyphens for mid-word glottal stops
"cnk", -- Khumi Chin; hyphens used in single words
"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
"de", -- too many false positives
"esx-esk-pro", -- hyphen used to separate morphemes
"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
"hil", -- Hiligaynon; hyphens for mid-word glottal stops
"ilo", -- Ilocano; hyphens for mid-word glottal stops
"lcp", -- Western Lawa; dash as syllable joiner
"lwl", -- Eastern Lawa; dash as syllable joiner
"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
"msb", -- Masbatenyo; too many false positives
"tl", -- Tagalog; too many false positives
"war", -- Waray-Waray; too many false positives
"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}
-- These languages will not have "LANG masculine nouns" and similar categories added.
data.no_gender_cat = {
-- Languages without gender but which use the gender field for other purposes
"ja",
"th",
}
data.notranslit = {
"ams",
"az",
"bbc",
"bug",
"cia",
"cjm",
"cmn",
"cpi",
"hak",
--"ja",
"kzg",
"lad",
"lzh",
"ms",
"mul",
"mvi",
"nan",
"nan-hbl",
"nan-hnm",
"nan-luh",
"nan-tws",
"oj",
"okn",
"ryn",
"rys",
"ryu",
"sh",
"tgt",
--"th",
"tkn",
"tly",
"txg",
"und",
--"vi",
"xug",
"yoi",
"yox",
"yue",
"za",
"zh",
-- Languages that use only the Thai script, or Thai/Latin; the rest are set in [[Module:languages]].
"th", -- Thai
"bzi", -- Bisu
"cbn", -- Nyahkur
"lcp", -- Western Lawa
"lwl", -- Eastern Lawa
"nyw", -- Nyaw
"skb", -- Saek
"sou", -- Southern Thai
"thm", -- Thavung
"tts", -- Isan
"ugo", -- Gong
"urk", -- Urak Lawoi'
}
-- Added locally.
data.forcetranslit = {
"eo",
"vi",
}
-- Script codes for which a script-tagged display title will be added.
data.toBeTagged = {
"Ahom",
"Arab",
"fa-Arab",
"glk-Arab",
"kk-Arab",
"ks-Arab",
"ku-Arab",
"mzn-Arab",
"ms-Arab",
"ota-Arab",
"pa-Arab",
"ps-Arab",
"sd-Arab",
"tt-Arab",
"ug-Arab",
"ur-Arab",
"Armi",
"Armn",
"Avst",
"Bali",
"Bamu",
"Batk",
"Beng",
"as-Beng",
"Bopo",
"Brah",
"Brai",
"Bugi",
"Buhd",
"Cakm",
"Cans",
"Cari",
"Cham",
"Cher",
"Copt",
"Cprt",
"Cyrl",
"Cyrs",
"Deva",
"Dsrt",
"Egyd",
"Egyp",
"Ethi",
"Geok",
"Geor",
"Glag",
"Goth",
"Grek",
"Polyt",
"polytonic",
"Gujr",
"Guru",
"Hang",
"Hani",
"Hano",
"Hebr",
"Hira",
"Hluw",
"Ital",
"Java",
"Kali",
"Kana",
"Khar",
"Khmr",
"Knda",
"Kthi",
"Lana",
"Laoo",
"Latn",
"Latf",
"Latg",
"Latnx",
"Latinx",
"pjt-Latn",
"Lepc",
"Limb",
"Linb",
"Lisu",
"Lyci",
"Lydi",
"Mand",
"Mani",
"Marc",
"Merc",
"Mero",
"Mlym",
"Mong",
"mnc-Mong",
"sjo-Mong",
"xwo-Mong",
"Mtei",
"Mymr",
"Narb",
"Nkoo",
"Ogam",
"Olck",
"Orkh",
"Orya",
"Osma",
"Ougr",
"Palm",
"Phag",
"Phli",
"Phlv",
"Phnx",
"Plrd",
"Prti",
"Rjng",
"Runr",
"Samr",
"Sarb",
"Saur",
"Sgnw",
"Shaw",
"Shrd",
"Sinh",
"Sora",
"Sund",
"Sylo",
"Syrc",
"Tagb",
"Tale",
"Talu",
"Taml",
"Tang",
"Tavt",
"Telu",
"Tfng",
"Tglg",
"Thaa",
"Thai",
"Tibt",
"Ugar",
"Vaii",
"Xpeo",
"Xsux",
"Yiii",
"Zmth",
"Zsym",
"Ipach",
"IPAchar",
"Music",
"musical",
"Rumin",
"Ruminumerals",
}
-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
data.pos_not_spelled_with_self = {
"diacritical marks",
"อักษรจีน", --Han characters
"ฮั้นถื่อ", --Han tu
"ฮันจา", --hanja
"ฮั่นจื้อ", --hanzi
"คันจิ", --kanji
"ตัวอักษร", --letters
"ตัวอักษรควบ", --ligatures
"logograms",
"ตัวเลข", --numeral symbols
"เลข", --numerals
"สัญลักษณ์", --symbols
}
-- Convert lists into sets.
for key, list in pairs(data) do
data[key] = {}
for _, item in ipairs(list) do
data[key][item] = true
end
end
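-- After this loop, each list above is a set, so membership checks are constant-time
-- lookups, e.g. data.lemmas["คำนาม"] == true and data.no_multiword_links["zh"] == true.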
------ 2. Lists that will not be converted into sets. ------
-- Recognized aliases for parts of speech (param 2=). Key is the short form and value is the canonical singular (not
-- pluralized) form. It is singular so that the same table can be used in [[Module:form of]] for the p=/POS= param
-- and [[Module:links]] for the pos= param.
data.pos_aliases = {
a = "adjective",
adj = "adjective",
adv = "adverb",
art = "article",
det = "determiner",
cnum = "cardinal number",
conj = "conjunction",
conv = "converb",
int = "interjection",
interj = "interjection",
intj = "interjection",
n = "noun",
num = "numeral",
part = "participle",
pcl = "particle",
phr = "phrase",
pn = "proper noun",
postp = "postposition",
pre = "preposition",
prep = "preposition",
pro = "pronoun",
pron = "pronoun",
prop = "proper noun",
proper = "proper noun",
onum = "ordinal number",
v = "verb",
vb = "verb",
vi = "intransitive verb",
vt = "transitive verb",
vti = "transitive and intransitive verb",
}
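-- e.g. data.pos_aliases["adj"] == "adjective". As noted above, values are kept
-- singular; callers needing the plural category form pluralize it themselves.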
-- Parts of speech for which categories like "German masculine nouns" or "Russian imperfective verbs"
-- will be generated if the headword is of the appropriate gender/number.
data.pos_for_gender_number_cat = {
["nouns"] = "nouns",
["proper nouns"] = "nouns",
["suffixes"] = "suffixes",
-- We include verbs because impf and pf are valid "genders".
["verbs"] = "verbs",
}
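-- e.g. a noun headword whose gender is "m" can be categorised as
-- "LANG masculine nouns", and a verb marked "impf" as "LANG imperfective verbs",
-- per the examples in the comment above.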
-- Convert a numeric list of characters and ranges to the equivalent Lua pattern. WARNING: This destructively modifies
-- the contents of `ranges`.
local function char_ranges_to_pattern(ranges)
for j, range in ipairs(ranges) do
if type(range) == "table" then
for k, char in ipairs(range) do
range[k] = u(char)
end
ranges[j] = table.concat(range, "-")
else
ranges[j] = u(range)
end
end
return table.concat(ranges)
end
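-- Illustrative example (not used below): char_ranges_to_pattern{{0x41, 0x5A}, 0x5F}
-- returns "A-Z_", ready to be wrapped in a character class such as "[A-Z_]".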
-- Combining character data used when categorising unusual characters. These
-- resolve into two patterns, used to find single combining characters
-- (i.e. character + diacritic(s)) or double combining characters
-- (i.e. character + diacritic(s) + character).
local comb_chars = {
single = {
{0x0300, 0x034E},
-- Exclude combining grapheme joiner.
{0x0350, 0x035B},
{0x0363, 0x036F},
{0x0483, 0x0489},
{0x0591, 0x05BD},
0x05BF,
{0x05C1, 0x05C2},
{0x05C4, 0x05C5},
0x05C7,
{0x0610, 0x061A},
{0x064B, 0x065F},
0x0670,
{0x06D6, 0x06DC},
{0x06DF, 0x06E4},
{0x06E7, 0x06E8},
{0x06EA, 0x06ED},
0x0711,
{0x0730, 0x074A},
{0x07A6, 0x07B0},
{0x07EB, 0x07F3},
0x07FD,
{0x0816, 0x0819},
{0x081B, 0x0823},
{0x0825, 0x0827},
{0x0829, 0x082D},
{0x0859, 0x085B},
{0x0898, 0x089F},
{0x08CA, 0x08E1},
{0x08E3, 0x0903},
{0x093A, 0x093C},
{0x093E, 0x094F},
{0x0951, 0x0957},
{0x0962, 0x0963},
{0x0981, 0x0983},
0x09BC,
{0x09BE, 0x09C4},
{0x09C7, 0x09C8},
{0x09CB, 0x09CD},
0x09D7,
{0x09E2, 0x09E3},
0x09FE,
{0x0A01, 0x0A03},
0x0A3C,
{0x0A3E, 0x0A42},
{0x0A47, 0x0A48},
{0x0A4B, 0x0A4D},
0x0A51,
{0x0A70, 0x0A71},
0x0A75,
{0x0A81, 0x0A83},
0x0ABC,
{0x0ABE, 0x0AC5},
{0x0AC7, 0x0AC9},
{0x0ACB, 0x0ACD},
{0x0AE2, 0x0AE3},
{0x0AFA, 0x0AFF},
{0x0B01, 0x0B03},
0x0B3C,
{0x0B3E, 0x0B44},
{0x0B47, 0x0B48},
{0x0B4B, 0x0B4D},
{0x0B55, 0x0B57},
{0x0B62, 0x0B63},
0x0B82,
{0x0BBE, 0x0BC2},
{0x0BC6, 0x0BC8},
{0x0BCA, 0x0BCD},
0x0BD7,
{0x0C00, 0x0C04},
0x0C3C,
{0x0C3E, 0x0C44},
{0x0C46, 0x0C48},
{0x0C4A, 0x0C4D},
{0x0C55, 0x0C56},
{0x0C62, 0x0C63},
{0x0C81, 0x0C83},
0x0CBC,
{0x0CBE, 0x0CC4},
{0x0CC6, 0x0CC8},
{0x0CCA, 0x0CCD},
{0x0CD5, 0x0CD6},
{0x0CE2, 0x0CE3},
0x0CF3,
{0x0D00, 0x0D03},
{0x0D3B, 0x0D3C},
{0x0D3E, 0x0D44},
{0x0D46, 0x0D48},
{0x0D4A, 0x0D4D},
0x0D57,
{0x0D62, 0x0D63},
{0x0D81, 0x0D83},
0x0DCA,
{0x0DCF, 0x0DD4},
0x0DD6,
{0x0DD8, 0x0DDF},
{0x0DF2, 0x0DF3},
0x0E31,
{0x0E34, 0x0E3A},
{0x0E47, 0x0E4E},
0x0EB1,
{0x0EB4, 0x0EBC},
{0x0EC8, 0x0ECE},
{0x0F18, 0x0F19},
0x0F35,
0x0F37,
0x0F39,
{0x0F3E, 0x0F3F},
{0x0F71, 0x0F84},
{0x0F86, 0x0F87},
{0x0F8D, 0x0F97},
{0x0F99, 0x0FBC},
0x0FC6,
{0x102B, 0x103E},
{0x1056, 0x1059},
{0x105E, 0x1060},
{0x1062, 0x1064},
{0x1067, 0x106D},
{0x1071, 0x1074},
{0x1082, 0x108D},
0x108F,
{0x109A, 0x109D},
{0x135D, 0x135F},
{0x1712, 0x1715},
{0x1732, 0x1734},
{0x1752, 0x1753},
{0x1772, 0x1773},
{0x17B4, 0x17D3},
0x17DD,
-- Exclude Mongolian variation selectors.
{0x1885, 0x1886},
0x18A9,
{0x1920, 0x192B},
{0x1930, 0x193B},
{0x1A17, 0x1A1B},
{0x1A55, 0x1A5E},
{0x1A60, 0x1A7C},
0x1A7F,
{0x1AB0, 0x1ACE},
{0x1B00, 0x1B04},
{0x1B34, 0x1B44},
{0x1B6B, 0x1B73},
{0x1B80, 0x1B82},
{0x1BA1, 0x1BAD},
{0x1BE6, 0x1BF3},
{0x1C24, 0x1C37},
{0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8},
0x1CED,
0x1CF4,
{0x1CF7, 0x1CF9},
{0x1DC0, 0x1DCC},
{0x1DCE, 0x1DFB},
{0x1DFD, 0x1DFF},
{0x20D0, 0x20F0},
{0x2CEF, 0x2CF1},
0x2D7F,
{0x2DE0, 0x2DFF},
{0x302A, 0x302F},
{0x3099, 0x309A},
{0xA66F, 0xA672},
{0xA674, 0xA67D},
{0xA69E, 0xA69F},
{0xA6F0, 0xA6F1},
0xA802,
0xA806,
0xA80B,
{0xA823, 0xA827},
0xA82C,
{0xA880, 0xA881},
{0xA8B4, 0xA8C5},
{0xA8E0, 0xA8F1},
0xA8FF,
{0xA926, 0xA92D},
{0xA947, 0xA953},
{0xA980, 0xA983},
{0xA9B3, 0xA9C0},
0xA9E5,
{0xAA29, 0xAA36},
0xAA43,
{0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D},
0xAAB0,
{0xAAB2, 0xAAB4},
{0xAAB7, 0xAAB8},
{0xAABE, 0xAABF},
0xAAC1,
{0xAAEB, 0xAAEF},
{0xAAF5, 0xAAF6},
{0xABE3, 0xABEA},
{0xABEC, 0xABED},
0xFB1E,
{0xFE20, 0xFE2F},
0x101FD,
0x102E0,
{0x10376, 0x1037A},
{0x10A01, 0x10A03},
{0x10A05, 0x10A06},
{0x10A0C, 0x10A0F},
{0x10A38, 0x10A3A},
0x10A3F,
{0x10AE5, 0x10AE6},
{0x10D24, 0x10D27},
{0x10EAB, 0x10EAC},
{0x10EFD, 0x10EFF},
{0x10F46, 0x10F50},
{0x10F82, 0x10F85},
{0x11000, 0x11002},
{0x11038, 0x11046},
0x11070,
{0x11073, 0x11074},
{0x1107F, 0x11082},
{0x110B0, 0x110BA},
0x110C2,
{0x11100, 0x11102},
{0x11127, 0x11134},
{0x11145, 0x11146},
0x11173,
{0x11180, 0x11182},
{0x111B3, 0x111C0},
{0x111C9, 0x111CC},
{0x111CE, 0x111CF},
{0x1122C, 0x11237},
0x1123E,
0x11241,
{0x112DF, 0x112EA},
{0x11300, 0x11303},
{0x1133B, 0x1133C},
{0x1133E, 0x11344},
{0x11347, 0x11348},
{0x1134B, 0x1134D},
0x11357,
{0x11362, 0x11363},
{0x11366, 0x1136C},
{0x11370, 0x11374},
{0x11435, 0x11446},
0x1145E,
{0x114B0, 0x114C3},
{0x115AF, 0x115B5},
{0x115B8, 0x115C0},
{0x115DC, 0x115DD},
{0x11630, 0x11640},
{0x116AB, 0x116B7},
{0x1171D, 0x1172B},
{0x1182C, 0x1183A},
{0x11930, 0x11935},
{0x11937, 0x11938},
{0x1193B, 0x1193E},
0x11940,
{0x11942, 0x11943},
{0x119D1, 0x119D7},
{0x119DA, 0x119E0},
0x119E4,
{0x11A01, 0x11A0A},
{0x11A33, 0x11A39},
{0x11A3B, 0x11A3E},
0x11A47,
{0x11A51, 0x11A5B},
{0x11A8A, 0x11A99},
{0x11C2F, 0x11C36},
{0x11C38, 0x11C3F},
{0x11C92, 0x11CA7},
{0x11CA9, 0x11CB6},
{0x11D31, 0x11D36},
0x11D3A,
{0x11D3C, 0x11D3D},
{0x11D3F, 0x11D45},
0x11D47,
{0x11D8A, 0x11D8E},
{0x11D90, 0x11D91},
{0x11D93, 0x11D97},
{0x11EF3, 0x11EF6},
{0x11F00, 0x11F01},
0x11F03,
{0x11F34, 0x11F3A},
{0x11F3E, 0x11F42},
0x13440,
{0x13447, 0x13455},
{0x16AF0, 0x16AF4},
{0x16B30, 0x16B36},
0x16F4F,
{0x16F51, 0x16F87},
{0x16F8F, 0x16F92},
-- Exclude Khitan Small Script filler.
{0x16FF0, 0x16FF1},
{0x1BC9D, 0x1BC9E},
{0x1CF00, 0x1CF2D},
{0x1CF30, 0x1CF46},
{0x1D165, 0x1D169},
{0x1D16D, 0x1D172},
{0x1D17B, 0x1D182},
{0x1D185, 0x1D18B},
{0x1D1AA, 0x1D1AD},
{0x1D242, 0x1D244},
{0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C},
0x1DA75,
0x1DA84,
{0x1DA9B, 0x1DA9F},
{0x1DAA1, 0x1DAAF},
{0x1E000, 0x1E006},
{0x1E008, 0x1E018},
{0x1E01B, 0x1E021},
{0x1E023, 0x1E024},
{0x1E026, 0x1E02A},
0x1E08F,
{0x1E130, 0x1E136},
0x1E2AE,
{0x1E2EC, 0x1E2EF},
{0x1E4EC, 0x1E4EF},
{0x1E8D0, 0x1E8D6},
{0x1E944, 0x1E94A},
},
double = {
{0x035C, 0x0362},
0x1DCD,
0x1DFC,
},
vs = { -- variation selectors; separated out so that we don't get categories for them
{0xFE00, 0xFE0F},
{0xE0100, 0xE01EF},
}
}
for key, set in pairs(comb_chars) do
comb_chars[key] = char_ranges_to_pattern(set)
end
comb_chars.both = comb_chars.single .. comb_chars.double .. comb_chars.vs
comb_chars = {
combined_single = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. comb_chars.vs .. "]+%f[^" .. comb_chars.both .. "]",
combined_double = "[^" .. comb_chars.both .. "][" .. comb_chars.single .. comb_chars.vs .. "]*[" .. comb_chars.double .. "]+[" .. comb_chars.both .. "]*.[" .. comb_chars.single .. comb_chars.vs .. "]*",
diacritics_single = "[" .. comb_chars.single .. "]",
diacritics_double = "[" .. comb_chars.double .. "]"
}
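-- Informal reading of the patterns above: `combined_single` matches a base
-- character followed by one or more single diacritics; `combined_double`
-- matches a base character, optional single diacritics, at least one double
-- diacritic and the character it attaches to; the `diacritics_*` patterns
-- match one such mark on its own.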
data.comb_chars = comb_chars
-- From https://unicode.org/Public/emoji/15.1/emoji-sequences.txt
local emoji_chars = {
{0x231A, 0x231B}, -- watch..hourglass done # E0.6 [2] (⌚..⌛)
{0x23E9, 0x23EC}, -- fast-forward button..fast down button # E0.6 [4] (⏩..⏬)
0x23F0, -- alarm clock # E0.6 [1] (⏰)
0x23F3, -- hourglass not done # E0.6 [1] (⏳)
{0x25FD, 0x25FE}, -- white medium-small square..black medium-small square # E0.6 [2] (◽..◾)
{0x2614, 0x2615}, -- umbrella with rain drops..hot beverage # E0.6 [2] (☔..☕)
{0x2648, 0x2653}, -- Aries..Pisces # E0.6 [12] (♈..♓)
0x267F, -- wheelchair symbol # E0.6 [1] (♿)
0x2693, -- anchor # E0.6 [1] (⚓)
0x26A1, -- high voltage # E0.6 [1] (⚡)
{0x26AA, 0x26AB}, -- white circle..black circle # E0.6 [2] (⚪..⚫)
{0x26BD, 0x26BE}, -- soccer ball..baseball # E0.6 [2] (⚽..⚾)
{0x26C4, 0x26C5}, -- snowman without snow..sun behind cloud # E0.6 [2] (⛄..⛅)
0x26CE, -- Ophiuchus # E0.6 [1] (⛎)
0x26D4, -- no entry # E0.6 [1] (⛔)
0x26EA, -- church # E0.6 [1] (⛪)
{0x26F2, 0x26F3}, -- fountain..flag in hole # E0.6 [2] (⛲..⛳)
0x26F5, -- sailboat # E0.6 [1] (⛵)
0x26FA, -- tent # E0.6 [1] (⛺)
0x26FD, -- fuel pump # E0.6 [1] (⛽)
0x2705, -- check mark button # E0.6 [1] (✅)
{0x270A, 0x270B}, -- raised fist..raised hand # E0.6 [2] (✊..✋)
0x2728, -- sparkles # E0.6 [1] (✨)
0x274C, -- cross mark # E0.6 [1] (❌)
0x274E, -- cross mark button # E0.6 [1] (❎)
{0x2753, 0x2755}, -- red question mark..white exclamation mark # E0.6 [3] (❓..❕)
0x2757, -- red exclamation mark # E0.6 [1] (❗)
{0x2795, 0x2797}, -- plus..divide # E0.6 [3] (➕..➗)
0x27B0, -- curly loop # E0.6 [1] (➰)
0x27BF, -- double curly loop # E1.0 [1] (➿)
{0x2B1B, 0x2B1C}, -- black large square..white large square # E0.6 [2] (⬛..⬜)
0x2B50, -- star # E0.6 [1] (⭐)
0x2B55, -- hollow red circle # E0.6 [1] (⭕)
{0x1F300, 0x1FAFF}, -- emoji in Plane 1
-- NOTE: There are lots more emoji sequences involving non-emoji Plane 0 symbols followed by 0xFE0F, which we don't
-- (yet?) handle.
}
emoji_chars = char_ranges_to_pattern(emoji_chars)
data.emoji_pattern = "[" .. emoji_chars .. "]"
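-- Usage sketch: mw.ustring.find(some_title, data.emoji_pattern) is truthy
-- whenever the title contains one of the emoji code points listed above.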
-- Get the map of unsupported characters and invert it (so the keys are the
-- "`...`" escapes and the values are the characters themselves).
local unsupported_characters = {}
for k, v in pairs(require("Module:links/data").unsupported_characters) do
unsupported_characters[v] = k
end
-- Get the list of unsupported titles and invert it (so the keys are pagenames and values are canonical titles).
local unsupported_titles = {}
for k, v in pairs(require("Module:links/data").unsupported_titles) do
unsupported_titles[v] = k
end
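-- Used below: a title of the form "ชื่อไม่รองรับ/<subpage>" is mapped back to
-- its real term via unsupported_titles[<subpage>], falling back to decoding
-- "`...`" escapes with unsupported_characters.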
data.unsupported_titles = unsupported_titles
------ 3. Page-wide processing (so that it only needs to be done once per page). ------
-- Get the pagename. ("ชื่อไม่รองรับ/" is the "Unsupported titles/" prefix.)
local pagename = title.subpageText
:gsub("^ชื่อไม่รองรับ/(.*)", function(m)
data.unsupported_title = true
return unsupported_titles[m] or (m:gsub("`.-`", unsupported_characters))
end)
-- Save the pagename, as the local variable will be destructively modified below.
data.pagename = pagename
-- Decompose the pagename into Unicode normalization form D (NFD).
data.decompose_pagename = mw.ustring.toNFD(pagename)
-- Explode the current page name into a character table, taking decomposed combining characters into account.
local explode_pagename = {}
local pagename_len = 0
local function explode(char)
explode_pagename[char] = true
pagename_len = pagename_len + 1
return ""
end
pagename = gsub(pagename, comb_chars.combined_double, explode)
pagename = gsub(pagename, comb_chars.combined_single, explode)
:gsub("[%z\1-\127\194-\244][\128-\191]*", explode)
data.explode_pagename = explode_pagename
data.pagename_len = pagename_len
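-- e.g. for a hypothetical page "ab́c" (with a combining acute on the "b"), the
-- combined-character passes consume "b́" as one unit and the byte-level pass
-- handles the rest, giving explode_pagename = {["a"] = true, ["b́"] = true,
-- ["c"] = true} and pagename_len = 3.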
-- Generate DEFAULTSORT.
data.encoded_pagename = mw.text.encode(data.pagename)
data.pagename_defaultsort = require("Module:languages").getByCode("mul"):makeSortKey(data.encoded_pagename)
frame:callParserFunction(
"DEFAULTSORT",
data.pagename_defaultsort
)
data.raw_defaultsort = title.text:uupper()
-- Get section numbers for the page.
do
local page_L2s = {}
local i = 0
for lvl, heading in content:gmatch("%f[^%z\n](=+)([^\n\r]+)%1[\t ]*%f[%z\n]") do
i = i + 1
if #lvl == 2 then
page_L2s[i] = trim(heading)
end
end
data.page_L2s = page_L2s
end
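-- e.g. for wikitext "==ภาษาไทย==\n...\n===รากศัพท์===\n...", page_L2s[1] is
-- "ภาษาไทย" while page_L2s[2] stays nil, since `i` counts every heading but
-- only level-2 headings are stored.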
------ 4. Parse page for maintenance categories. ------
content = content:gsub("%[%[", "\1"):gsub("]]", "\2")
-- Use of tab characters.
if content:find("\t") then
data.tab_characters = frame:expandTemplate{
title = "tracking category",
args = {"Pages with tab characters"}
}
end
-- Unencoded character(s) in title.
local IDS = {
["⿰"] = true, ["⿱"] = true, ["⿲"] = true, ["⿳"] = true,
["⿴"] = true, ["⿵"] = true, ["⿶"] = true, ["⿷"] = true,
["⿸"] = true, ["⿹"] = true, ["⿺"] = true, ["⿻"] = true,
[""] = true, [""] = true, [""] = true, [""] = true,
[""] = true
}
for char in pairs(explode_pagename) do
if IDS[char] and char ~= data.pagename then
data.unencoded_char = true
break
end
end
-- Raw wikitext use of {{DISPLAYTITLE:}}.
if content:find("{{%s*DISPLAYTITLE:.-}}") then
data.pagename_displaytitle_conflict = frame:expandTemplate{
title = "tracking category",
args = {"Pages with DISPLAYTITLE conflicts"}
}
end
-- Raw wikitext use of a topic or langname category. Also check if any raw sortkeys have been used.
do
-- All chars treated as spaces in links (including categories).
local spaces = " _" ..
"\194\160" ..
"\225\154\128" ..
"\225\160\142" ..
"\226\128\128-\226\128\138" ..
"\226\128\168" ..
"\226\128\169" ..
"\226\128\175" ..
"\226\129\159" ..
"\227\128\128"
local wikitext_topic_cat = {}
local wikitext_langname_cat = {}
local raw_sortkey
local langnames = mw.loadData("Module:languages/canonical names")
local etym_langnames = mw.loadData("Module:etymology languages/canonical names")
-- Record a category marker in the relevant table. Without a sortkey, just mark
-- it as seen; with one, make sure the entry is a table (replacing a bare
-- `true` if necessary) and append the sortkey.
local function add_cat_table(marker, sortkey, tbl)
if not sortkey then
tbl[marker] = tbl[marker] or true
return true
elseif type(tbl[marker]) ~= "table" then
tbl[marker] = {}
end
insert(tbl[marker], sortkey)
return true
end
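-- Resulting shape, e.g.: tbl["ไทย"] == true when only bare categories were
-- seen, or tbl["ไทย"] == {"sortkey1", "sortkey2"} once raw sortkeys appear.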
local function do_iteration(name, sortkey, wikitext_langname_cat)
if langnames[name] then
return add_cat_table(name, sortkey, wikitext_langname_cat)
end
name = etym_langnames[name] and name or content_lang:lcfirst(name)
if etym_langnames[name] then
name = get_etym_lang(name):getNonEtymologicalName()
return add_cat_table(name, sortkey, wikitext_langname_cat)
end
end
local function process_category(cat)
cat = trim(cat, spaces)
local code = cat:match("^([%w%-.]+):")
local sortkey = cat:match("|(.*)")
if sortkey then
raw_sortkey = raw_sortkey or frame:expandTemplate{
title = "tracking category",
args = {"Pages with raw sortkeys"}
}
end
if code then
return add_cat_table(code, sortkey, wikitext_topic_cat)
end
-- Remove sortkey and split by word.
cat = split(cat:gsub("|.*", ""), "[" .. spaces .. "]+")
-- Iterate over the category name, starting with the longest possible name and
-- shaving off the first word until we find a match. We do it this way because:
-- (a) going from shortest to longest risks falsely matching (e.g.) German Low German categories as German;
-- (b) checking the start of the category name first risks falsely matching (e.g.) Alsatian French as Alsatian (a variety of Alemannic German) rather than French.
-- If no match is found, then check the start of the category name, shaving off the last word each iteration.
local cat_len = #cat
local n, name, done = 1
repeat
name = concat(cat, " ", n, cat_len)
done = do_iteration(name, sortkey, wikitext_langname_cat)
if done then
return
end
n = n + 1
until n > cat_len
n = cat_len - 1
if n <= 0 then
return
end
repeat
name = concat(cat, " ", 1, n)
done = do_iteration(name, sortkey, wikitext_langname_cat)
if done then
return
end
n = n - 1
until n == 0
end
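-- Scan raw category links. "\1" and "\2" stand for "[[" and "]]" (substituted
-- at the start of section 4), so this gmatch visits every category-style link
-- whose namespace prefix contains "cat"; the prefix check below narrows it to
-- the exact "cat"/"category" aliases.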
for prefix, cat in content:gmatch("\1([^\1\2]-[Cc][Aa][Tt][^\1\2]-):([^\1]-)\2") do
prefix = trim(prefix, spaces):lower()
if prefix == "cat" or prefix == "category" then
process_category(cat)
end
end
data.wikitext_topic_cat = wikitext_topic_cat
data.wikitext_langname_cat = wikitext_langname_cat
data.raw_sortkey = raw_sortkey
end
return data