-- Module:headword/data
--
-- From Wiktionary, the free dictionary
local headword_page_module = "Module:headword/page"

local concat = table.concat
local get_etym_lang = require("Module:etymology languages").getByCanonicalName
local gsub = mw.ustring.gsub
local insert = table.insert
local set = require("Module:utilities/set")
local split = mw.text.split
local trim = mw.text.trim
local type_or_class = require("Module:parser").type_or_class
local u = require("Module:string/char")

-- Table of headword data assembled below; it is this module's return value.
local data = {}

------ 1. Lists that will be converted into sets. ------

-- Names in the "invariable" set. Most entries are Thai names; the trailing comment on
-- each gives the English term it stands for ("phrasebook" is left in English).
-- NOTE(review): presumably part-of-speech categories whose entries never inflect --
-- how this set is used is not visible in this file; confirm in the consuming module.
data.invariable = set{
	"ชมาโว", --cmavo (Lojban)
	"ชเมเน", --cmene (Lojban)
	"ฟูฮิฝลา", --fu'ivla (Lojban)
	"กิสมู", --gismu (Lojban)
	"ฮั้นถื่อ", --Han tu
	"ฮั่นจื้อ", --hanzi
	"ฮันจา", --hanja
	"ยฺหวืดเพ็ง", --jyutping
	"คันจิ", --kanji
	"ลุฌโว", --lujvo (Lojban)
	"phrasebook",
	"พินอิน", --pinyin
	"รัฟซี", --rafsi (Lojban)
	"โรมาจิ", --romaji
}

-- Names in the "lemmas" set. Thai names carry their English equivalent in the trailing
-- comment; names without a comment are still in English.
-- "คำลักษณนาม" is listed twice because both "classifiers" and "counters" map to the same
-- Thai name. "equative adjectives" is also listed in data.nonlemmas.
-- NOTE(review): presumably the part-of-speech categories treated as lemmas -- confirm
-- against the consumer of this table.
data.lemmas = set{
	"คำย่อ", --abbreviations
	"acronyms",
	"คำคุณศัพท์", --adjectives
	"adnominals",
	"adpositions",
	"คำกริยาวิเศษณ์", --adverbs
	"affixes",
	"ambipositions",
	"คำกำกับนาม", --articles
	"circumfixes",
	"circumpositions",
	"คำลักษณนาม", --classifiers
	"ชมาโว", --cmavo
	"cmavo clusters",
	"ชเมเน", --cmene
	"combining forms",
	"คำสันธาน", --conjunctions
	"คำลักษณนาม", --counters = classifiers (same Thai name, so this repeats the entry above)
	"คำกำหนด", --determiners
	"diacritical marks",
	"ทวิอักษร", --digraphs
	"equative adjectives",
	"ฟูฮิฝลา", --fu'ivla
	"กิสมู", --gismu
	"อักษรจีน", --Han characters
	"ฮั้นถื่อ", --Han tu
	"ฮั่นจื้อ", --hanzi
	"ฮันจา", --hanja
	"ideophones",
	"สำนวน", --idioms
	"อาคม", --infixes
	"อักษรย่อ", --initialisms
	"interfixes",
	"คำอุทาน", --interjections
	"คันจิ", --kanji
	"ตัวอักษร", --letters
	"ตัวอักษรควบ", --ligatures
	"logograms",
	"ลุฌโว", --lujvo
	"หน่วยคำ", --morphemes
	"non-constituents",
	"คำนาม", --nouns
	"จำนวน", --numbers
	"ตัวเลข", --numeral symbols
	"เลข", --numerals
	"คำอนุภาค", --particles
	"วลี", --phrases
	"คำปัจฉบท", --postpositions
	"postpositional phrases",
	"predicatives",
	"อุปสรรค", --prefixes
	"prepositional phrases",
	"คำบุพบท", --prepositions
	"preverbs",
	"pronominal adverbs",
	"คำสรรพนาม", --pronouns
	"คำวิสามานยนาม", --proper nouns
	"สุภาษิต", --proverbs
	"เครื่องหมายวรรคตอน", --punctuation marks
	"relatives",
	"ราก", --roots
	"stems",
	"ปัจจัย", --suffixes
	"พยางค์", --syllables
	"สัญลักษณ์", --symbols
	"คำกริยา", --verbs
}

-- Names in the "nonlemmas" set. Thai names carry their English equivalent in the trailing
-- comment; names without a comment are still in English.
-- "equative adjectives" is also listed in data.lemmas.
-- NOTE(review): presumably the part-of-speech categories treated as non-lemma forms --
-- confirm against the consumer of this table.
data.nonlemmas = set{
	"รูปผันพาร์ทิซิเพิลกรรตุวาจก", --active participle forms
	"พาร์ทิซิเพิลกรรตุวาจก", --active participles
	"พาร์ทิซิเพิลเชิงคุณศัพท์", --adjectival participles
    "adjective case forms",
	"รูปผันคำคุณศัพท์", --adjective forms
	"adjective feminine forms",
	"รูปผันคำคุณศัพท์พหูพจน์", --adjective plural forms
	"รูปผันคำกริยาวิเศษณ์", --adverb forms
	"พาร์ทิซิเพิลเชิงกริยาวิเศษณ์", --adverbial participles
	"agent participles",
	"รูปผันคำกำกับนาม", --article forms
	"circumfix forms",
	"combined forms",
	"รูปผันคำคุณศัพท์ขั้นกว่า", --comparative adjective forms
	"คำคุณศัพท์ขั้นกว่า", --comparative adjectives
	"รูปผันคำกริยาวิเศษณ์ขั้นกว่า", --comparative adverb forms
	"คำกริยาวิเศษณ์ขั้นกว่า", --comparative adverbs
	"รูปผันคำสันธาน", --conjunction forms
	"contractions",
	"converbs",
	"รูปผันคำกำหนดขั้นกว่า", --determiner comparative forms
	"รูปผันคำกำหนด", --determiner forms
	"รูปผันคำกำหนดขั้นสุด", --determiner superlative forms
	"คำนามบอกความเล็ก", --diminutive nouns
	"elative adjectives",
	"equative adjective forms",
	"equative adjectives",
	"พาร์ทิซิเพิลอนาคตกาล", --future participles
	"gerunds",
	"infinitive forms",
	"infinitives",
	"รูปผันคำอุทาน", --interjection forms
	"ยฺหวืดเพ็ง", --jyutping
	"kanji readings",
	"misspellings",
	"negative participles",
	"nominal participles",
	"noun case forms",
	"รูปผันคำนามทวิพจน์", --noun dual forms
	"รูปผันคำนาม", --noun forms
	"noun paucal forms",
	"รูปผันคำนามพหูพจน์", --noun plural forms
	"noun possessive forms",
	"noun singulative forms",
	"รูปผันเลข", --numeral forms
	"พาร์ทิซิเพิล", --participles
	"รูปผันพาร์ทิซิเพิล", --participle forms
	"รูปผันคำอนุภาค", --particle forms
	"พาร์ทิซิเพิลกรรมวาจก", --passive participles
	"พาร์ทิซิเพิลกรรตุวาจกอดีตกาล", --past active participles
	"พาร์ทิซิเพิลอดีตกาล", --past participles
	"รูปผันพาร์ทิซิเพิลอดีตกาล", --past participle forms
	"พาร์ทิซิเพิลกรรมวาจกอดีตกาล", --past passive participles
	"perfect active participles",
	"perfect participles",
	"perfect passive participles",
	"พินอิน", --pinyin
	"พหูพจน์", --plurals
	"รูปผันคำปัจฉบท", --postposition forms
	"รูปผันอุปสรรค", --prefix forms
	"preposition contractions",
	"รูปผันคำบุพบท", --preposition forms
	"prepositional pronouns",
	"พาร์ทิซิเพิลกรรตุวาจกปัจจุบันกาล", --present active participles
	"พาร์ทิซิเพิลปัจจุบันกาล", --present participles
	"พาร์ทิซิเพิลกรรมวาจกปัจจุบันกาล", --present passive participles
	"รูปผันคำสรรพนาม", --pronoun forms
	"pronoun possessive forms",
	"รูปผันคำวิสามานยนาม", --proper noun forms
	"รูปผันคำวิสามานยนามพหูพจน์", --proper noun plural forms
	"รัฟซี", --rafsi
	"การถอดเป็นอักษรโรมัน", --romanizations
	"root forms",
	"singulatives",
	"รูปผันปัจจัย", --suffix forms
	"รูปผันคำคุณศัพท์ขั้นสุด", --superlative adjective forms
	"คำคุณศัพท์ขั้นสุด", --superlative adjectives
	"รูปผันคำกริยาวิเศษณ์ขั้นสุด", --superlative adverb forms
	"คำกริยาวิเศษณ์ขั้นสุด", --superlative adverbs
	"รูปผันคำกริยา", --verb forms
	"verbal nouns",
}

-- These languages will not have links to separate parts of the headword.
data.no_multiword_links = set{
	"zh", -- Chinese
}

-- These languages will not have "LANG multiword terms" categories added.
-- Entries are language codes, grouped below by the reason the category is suppressed.
data.no_multiword_cat = set{
	-------- Languages without spaces between words (sometimes spaces between phrases) --------
	"blt", -- Tai Dam
	"ja", -- Japanese
	"khb", -- Lü
	"km", -- Khmer
	"lo", -- Lao
	"mnw", -- Mon
	"my", -- Burmese
	"nan", -- Min Nan (some words in Latin script; hyphens between syllables)
	"nan-hbl", -- Hokkien (some words in Latin script; hyphens between syllables)
	"nod", -- Northern Thai
	"ojp", -- Old Japanese
	"shn", -- Shan
	"sou", -- Southern Thai
	"tdd", -- Tai Nüa
	"th", -- Thai
	"tts", -- Isan
	"twh", -- Tai Dón
	"txg", -- Tangut
	"zh", -- Chinese (all varieties with Chinese characters)
	"zkt", -- Khitan

	-------- Languages with spaces between syllables --------
	"ahk", -- Akha
	"aou", -- A'ou
	"atb", -- Zaiwa
	"byk", -- Biao
	"cdy", -- Chadong
	--"duu", -- Drung; not sure
	--"hmx-pro", -- Proto-Hmong-Mien
	--"hnj", -- Green Hmong; not sure
	"huq", -- Tsat
	"ium", -- Iu Mien
	--"lis", -- Lisu; not sure
	"mtq", -- Muong
	--"mww", -- White Hmong; not sure
	"onb", -- Lingao
	--"sit-gkh", -- Gokhy; not sure
	--"swi", -- Sui; not sure
	"tbq-lol-pro", -- Proto-Loloish
	"tdh", -- Thulung
	"ukk", -- Muak Sa-aak
	"vi", -- Vietnamese
	"yig", -- Wusa Nasu
	"zng", -- Mang

	-------- Languages with ~ with surrounding spaces used to separate variants --------
	"mkh-ban-pro", -- Proto-Bahnaric
	"sit-pro", -- Proto-Sino-Tibetan (this is its only entry in this table)

	-------- Other weirdnesses --------
	"mul", -- Translingual; gestures, Morse code, etc.
	"aot", -- Atong (India); bullet is a letter

	-------- All sign languages --------
	"ads",
	"aed",
	"aen",
	"afg",
	"ase",
	"asf",
	"asp",
	"asq",
	"asw",
	"bfi",
	"bfk",
	"bog",
	"bqn",
	"bqy",
	"bvl",
	"bzs",
	"cds",
	"csc",
	"csd",
	"cse",
	"csf",
	"csg",
	"csl",
	"csn",
	"csq",
	"csr",
	"doq",
	"dse",
	"dsl",
	"ecs",
	"esl",
	"esn",
	"eso",
	"eth",
	"fcs",
	"fse",
	"fsl",
	"fss",
	"gds",
	"gse",
	"gsg",
	"gsm",
	"gss",
	"gus",
	"hab",
	"haf",
	"hds",
	"hks",
	"hos",
	"hps",
	"hsh",
	"hsl",
	"icl",
	"iks",
	"ils",
	"inl",
	"ins",
	"ise",
	"isg",
	"isr",
	"jcs",
	"jhs",
	"jls",
	"jos",
	"jsl",
	"jus",
	"kgi",
	"kvk",
	"lbs",
	"lls",
	"lsl",
	"lso",
	"lsp",
	"lst",
	"lsy",
	"lws",
	"mdl",
	"mfs",
	"mre",
	"msd",
	"msr",
	"mzc",
	"mzg",
	"mzy",
	"nbs",
	"ncs",
	"nsi",
	"nsl",
	"nsp",
	"nsr",
	"nzs",
	"okl",
	"pgz",
	"pks",
	"prl",
	"prz",
	"psc",
	"psd",
	"psg",
	"psl",
	"pso",
	"psp",
	"psr",
	"pys",
	"rms",
	"rsl",
	"rsm",
	"sdl",
	"sfb",
	"sfs",
	"sgg",
	"sgx",
	"slf",
	"sls",
	"sqk",
	"sqs",
	"ssp",
	"ssr",
	"svk",
	"swl",
	"syy",
	"tse",
	"tsm",
	"tsq",
	"tss",
	"tsy",
	"tza",
	"ugn",
	"ugy",
	"ukl",
	"uks",
	"vgt",
	"vsi",
	"vsl",
	"vsv",
	"xki",
	"xml",
	"xms",
	"ygs",
	"ysl",
	"zib",
	"zsl",
}

-- In these languages, the hyphen is not considered a word separator for the "multiword terms" category.
-- Entries are language codes; each trailing comment records why hyphens are exempted.
data.hyphen_not_multiword_sep = set{
	"akk", -- Akkadian; hyphens between syllables
	"akl", -- Aklanon; hyphens for mid-word glottal stops
	"ber-pro", -- Proto-Berber; morphemes separated by hyphens
	"ceb", -- Cebuano; hyphens for mid-word glottal stops
	"cnk", -- Khumi Chin; hyphens used in single words
	"cpi", -- Chinese Pidgin English; Chinese-derived words with hyphens between syllables
	"de", -- German; too many false positives
	"esx-esk-pro", -- hyphen used to separate morphemes
	"fi", -- Finnish; hyphen used to separate components in compound words if the final and initial vowels match, respectively
	"hil", -- Hiligaynon; hyphens for mid-word glottal stops
	"ilo", -- Ilocano; hyphens for mid-word glottal stops
	"lcp", -- Western Lawa; dash as syllable joiner
	"lwl", -- Eastern Lawa; dash as syllable joiner
	"mfa", -- Pattani Malay in Thai script; dash as syllable joiner
	"mkh-vie-pro", -- Proto-Vietic; morphemes separated by hyphens
	"msb", -- Masbatenyo; too many false positives
	"tl", -- Tagalog; too many false positives
	"war", -- Waray-Waray; too many false positives
	"yo", -- Yoruba; hyphens used to show lengthened nasal vowels
}

-- These languages will not have "LANG masculine nouns" and similar categories added.
-- Entries are language codes.
data.no_gender_cat = set{
	-- Languages without gender but which use the gender field for other purposes
	"ja", -- Japanese
	"th", -- Thai
}

-- Language codes in the "notranslit" set.
-- NOTE(review): presumably languages for which the headword shows no transliteration --
-- the consumer is not visible in this file; confirm there.
-- Commented-out codes are entries that have been disabled in this first group.
data.notranslit = set{
	"ams",
	"az",
	"bbc",
	"bug",
	"cia",
	"cjm",
	"cmn",
	"cpi",
	"hak",
	--"ja",
	"kzg",
	"lad",
	"lzh",
	"ms",
	"mul",
	"mvi",
	"nan",
	"nan-hbl",
	"nan-hnm",
	"nan-luh",
	"nan-tws",
	"oj",
	"okn",
	"ryn",
	"rys",
	"ryu",
	"sh",
	"tgt",
	--"th", -- disabled here, but "th" is active in the Thai-script group at the end of this table
	"tkn",
	"tly",
	"txg",
	"und",
	--"vi",
	"xug",
	"yoi",
	"yox",
	"yue",
	"za",
	"zh",

	-- Languages written only in Thai script, or in Thai/Latin; all others are configured in the languages data
	"th", -- Thai
	"bzi", -- Bisu
	"cbn", -- Nyah Kur
	"lcp", -- Western Lawa
	"lwl", -- Eastern Lawa
	"nyw", -- Nyaw
	"skb", -- Saek
	"sou", -- Southern Thai
	"thm", -- Thavung
	"tts", -- Isan
	"ugo", -- Gong
	"urk", -- Urak Lawoi'
}

-- Local addition (not part of the upstream list): language codes in the "forcetranslit" table.
-- NOTE(review): presumably languages whose transliteration is always shown -- confirm
-- against the consumer of this table.
--
-- This table sits in the "converted into sets" section, yet it was the only one left as
-- a bare list, so a keyed membership test such as data.forcetranslit["vi"] returned nil.
-- The list entries are kept for any caller that iterates them, and code -> true keys are
-- added so that set-style lookups work the way they do for the sibling tables.
data.forcetranslit = {
	"eo", -- Esperanto
	"vi", -- Vietnamese
}
-- ipairs only walks the integer keys, so adding string keys during the loop is safe.
for _, code in ipairs(data.forcetranslit) do
	data.forcetranslit[code] = true
end

-- Script codes for which a script-tagged display title will be added.
-- Indented entries are listed directly under the script code they follow.
-- NOTE(review): the indented codes look like variants of that parent script, and the
-- final group ("Ipach" ... "Ruminumerals") looks like special-purpose codes rather than
-- regular script codes -- confirm against the script data.
data.toBeTagged = set{
	"Ahom",
	"Arab",
		"fa-Arab",
		"glk-Arab",
		"kk-Arab",
		"ks-Arab",
		"ku-Arab",
		"mzn-Arab",
		"ms-Arab",
		"ota-Arab",
		"pa-Arab",
		"ps-Arab",
		"sd-Arab",
		"tt-Arab",
		"ug-Arab",
		"ur-Arab",
	"Armi",
	"Armn",
	"Avst",
	"Bali",
	"Bamu",
	"Batk",
	"Beng",
		"as-Beng",
	"Bopo",
	"Brah",
	"Brai",
	"Bugi",
	"Buhd",
	"Cakm",
	"Cans",
	"Cari",
	"Cham",
	"Cher",
	"Copt",
	"Cprt",
	"Cyrl",
	"Cyrs",
	"Deva",
	"Dsrt",
	"Egyd",
	"Egyp",
	"Ethi",
	"Geok",
	"Geor",
	"Glag",
	"Goth",
	"Grek",
		"Polyt",
		"polytonic",
	"Gujr",
	"Guru",
	"Hang",
	"Hani",
	"Hano",
	"Hebr",
	"Hira",
	"Hluw",
	"Ital",
	"Java",
	"Kali",
	"Kana",
	"Khar",
	"Khmr",
	"Knda",
	"Kthi",
	"Lana",
	"Laoo",
	"Latn",
		"Latf",
		"Latg",
		"Latnx",
		"Latinx",
		"pjt-Latn",
	"Lepc",
	"Limb",
	"Linb",
	"Lisu",
	"Lyci",
	"Lydi",
	"Mand",
	"Mani",
	"Marc",
	"Merc",
	"Mero",
	"Mlym",
	"Mong",
		"mnc-Mong",
		"sjo-Mong",
		"xwo-Mong",
	"Mtei",
	"Mymr",
	"Narb",
	"Nkoo",
	"Ogam",
	"Olck",
	"Orkh",
	"Orya",
	"Osma",
	"Ougr",
	"Palm",
	"Phag",
	"Phli",
	"Phlv",
	"Phnx",
	"Plrd",
	"Prti",
	"Rjng",
	"Runr",
	"Samr",
	"Sarb",
	"Saur",
	"Sgnw",
	"Shaw",
	"Shrd",
	"Sinh",
	"Sora",
	"Sund",
	"Sylo",
	"Syrc",
	"Tagb",
	"Tale",
	"Talu",
	"Taml",
	"Tang",
	"Tavt",
	"Telu",
	"Tfng",
	"Tglg",
	"Thaa",
	"Thai",
	"Tibt",
	"Ugar",
	"Vaii",
	"Xpeo",
	"Xsux",
	"Yiii",
	"Zmth",
	"Zsym",

	"Ipach",
	"IPAchar",
	"Music",
	"musical",
	"Rumin",
	"Ruminumerals",
}

-- Parts of speech which will not be categorised in categories like "English terms spelled with É" if
-- the term is the character in question (e.g. the letter entry for English [[é]]). This contrasts with
-- entries like the French adjective [[m̂]], which is a one-letter word spelled with the letter.
-- Thai names carry their English equivalent in the trailing comment; names without a comment are
-- still in English.
data.pos_not_spelled_with_self = set{
	"diacritical marks",
	"อักษรจีน", --Han characters
	"ฮั้นถื่อ", --Han tu
	"ฮันจา", --hanja
	"ฮั่นจื้อ", --hanzi
	"คันจิ", --kanji
	"ตัวอักษร", --letters
	"ตัวอักษรควบ", --ligatures
	"logograms",
	"ตัวเลข", --numeral symbols
	"เลข", --numerals
	"สัญลักษณ์", --symbols
}

------ 2. Lists that will not be converted into sets. ------

-- Short aliases accepted for a part of speech (param 2=), each mapped to its canonical form. The
-- canonical form is deliberately singular (not pluralized), which lets [[Module:form of]] (p=/POS=
-- param) and [[Module:links]] (pos= param) share this same table.
-- Aliases are grouped one canonical form per line, ordered alphabetically by canonical form.
data.pos_aliases = {
	a = "adjective", adj = "adjective",
	adv = "adverb",
	art = "article",
	cnum = "cardinal number",
	conj = "conjunction",
	conv = "converb",
	det = "determiner",
	hanzi = "Han character",
	int = "interjection", interj = "interjection", intj = "interjection",
	vi = "intransitive verb",
	n = "noun",
	num = "numeral",
	onum = "ordinal number",
	part = "participle",
	pcl = "particle",
	phr = "phrase",
	postp = "postposition",
	pre = "preposition", prep = "preposition",
	pro = "pronoun", pron = "pronoun",
	pn = "proper noun", prop = "proper noun", proper = "proper noun",
	vti = "transitive and intransitive verb",
	vt = "transitive verb",
	v = "verb", vb = "verb",
}

-- Parts of speech that get categories such as "German masculine nouns" or "Russian imperfective
-- verbs" when the headword has the appropriate gender/number. Each key is a part of speech and
-- each value is the name stored for it (note that "proper nouns" is stored as "nouns").
data.pos_for_gender_number_cat = {
	nouns = "nouns",
	["proper nouns"] = "nouns",
	suffixes = "suffixes",
	verbs = "verbs", -- included because impf and pf are valid "genders"
}

------ 3. Page-wide processing (so that it only needs to be done once per page). ------
do
	-- Run the page processor once and keep its result on the data table.
	local page = require(headword_page_module).process_page()
	data.page = page
	-- FIXME: data.pagename and data.encoded_pagename are referenced directly in scattered places
	-- across the codebase, so both fields are mirrored at the top level of the data table here.
	data.pagename = page.pagename
	data.encoded_pagename = page.encoded_pagename
end

return data