มอดูล:zh-sortkey

จาก วิกิพจนานุกรม พจนานุกรมเสรี
ไปยังการนำทาง ไปยังการค้นหา

This module generates sortkeys for entry titles containing Han characters. It is used for Chinese, Japanese, and Vietnamese terms written in the Han script.

The demonstration functions that generated the content shown below are housed in Module:zh-sortkey/templates. Modifications to the module can be tested in Module:zh-sortkey/sandbox. Sortkeys for individual characters are retrieved from one of 178 data modules. Module:zh-sortkey/data creates documentation for these modules.

Show sortkeys[แก้ไข]

  • PS/2接口 (PS/2手08口00)
  • gas爐 (gas火16)
  • γ粒子 (γ米05子00)
  • 命裡有時終須有,命裡無時莫強求 (口05衣07月02日06糸05頁03月02,口05衣07火08日06艸07弓08水02)
  • 得個……字 (彳08人08……子03)
  • 濕𣲷𣲷 (水14水05水05)
  • 赛车 (贝10车00)
  • (乙01)
  • 𡆔 (口23)
  • 𡎇 (土09)
  • 阿坝 (阜05土04)

Ideographic description sequences[แก้ไข]

  • ⿰亻革 (人09)
  • ⿰亻革家語 (人09宀07言07)
  • ⿰石田 (石05)
  • ⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心⿺辶⿳穴⿲月⿱⿲幺言幺⿲長馬長刂心麵 (辵54辵54麥09)

Show data modules[แก้ไข]


แม่แบบ:module cat


-- Table of public functions; returned at the end of the module.
local export = {}

-- Namespace text of the page currently being rendered. Used by log() so that
-- debug output is only produced when running in the "Module" namespace.
local namespace = mw.title.getCurrentTitle().nsText

-- Short alias for the Unicode-aware substring function (mw.ustring.sub).
local substring = mw.ustring.sub

-- Debug logger: forwards all of its arguments to mw.log, but only when the
-- current page is in the "Module" namespace. Silent everywhere else.
local function log(...)
	if namespace ~= "Module" then
		return
	end
	mw.log(...)
end

--[[
	Maps each ideographic description character (IDC) to the number of
	components (single characters or nested ideographic description
	sequences) that must follow it.
]]
local IDchars = {}

-- IDCs that take two components.
for _, binaryIDC in ipairs { "⿰", "⿱", "⿴", "⿵", "⿶", "⿷", "⿸", "⿹", "⿺", "⿻" } do
	IDchars[binaryIDC] = 2
end

-- IDCs that take three components.
for _, ternaryIDC in ipairs { "⿲", "⿳" } do
	IDchars[ternaryIDC] = 3
end

--[[
-- in future perhaps: https://www.unicode.org/L2/L2018/18012-irgn2273-four-new-idcs.pdf
IDchars[mw.ustring.char(0x2FFC)] = 2
IDchars[mw.ustring.char(0x2FFD)] = 2
IDchars[mw.ustring.char(0x2FFE)] = 1
IDchars[mw.ustring.char(0x2FFF)] = 1
--]]

--[[
	Finds the end of the ideographic description sequence (IDS) that begins
	with the ideographic description character (IDC) IDchar at character
	index i of text.

	Returns the character index of the last character of the IDS, or nil if
	any argument is missing, IDchar is not a known IDC, or the text ends
	before all expected components have been read. Recurses whenever a
	component is itself introduced by another IDC.
]]
local function findEndOfIDS(text, IDchar, i)
	if not (text and IDchar and i) then
		return nil
	end
	
	-- Number of components expected after the current IDC.
	local components = IDchars[IDchar]
	if not components then
		return nil
	end
	
	local j = i
	for _ = 1, components do
		j = j + 1
		
		local char = substring(text, j, j)
		
		if char == "" then
			-- Ran off the end of the text: the sequence is incomplete.
			return nil
		elseif IDchars[char] then
			-- The component is a nested IDS; skip to its last character.
			j = findEndOfIDS(text, char, j)
			if not j then
				--[[
					The nested sequence is incomplete, so this one is too.
					Bail out here rather than continuing the loop with a
					nil index, which would raise an arithmetic error.
				]]
				return nil
			end
		end
	end
	
	-- All expected components were found; j is the index of the last one.
	return j
end

--[=[
	The sortkey modules handle two sets of codepoints.
	The first set runs from [[Module:zh-sortkey/data/001]]
	to [[Module:zh-sortkey/data/056]], then there is a gap
	of 90134 codepoints. The second set runs from
	[[Module:zh-sortkey/data/057]] to
	[[Module:zh-sortkey/data/177]].

	This table maps the first codepoint of each set to the number of the
	first data module of that set.
]=]
local moduleStart = {
	[13312] = 1,
	[131072] = 57,
}

--[[
	Looks up a codepoint in the numbered data module that covers it. Each
	data module covers 500 consecutive codepoints.

	codepoint:    the codepoint to look up (a number).
	start:        first codepoint of the set that codepoint belongs to; must
	              be a key of moduleStart.
	returnModule: if truthy, return the name of the data module instead of
	              loading it.

	Returns the module name (when returnModule is truthy); otherwise the value
	stored for codepoint in the data module, or nil if the module could not
	be loaded or has no entry for codepoint.
]]
local function getFromModule(codepoint, start, returnModule)
	--[[
		Floor explicitly so that "%03d" always receives an integer instead
		of relying on string.format truncating a fractional number.
	]]
	local moduleNumber = math.floor((codepoint - start) / 500) + moduleStart[start]
	local moduleName = string.format("Module:zh-sortkey/data/%03d", moduleNumber)
--	log(codepoint .. ": data module: " .. moduleName)
	
	if returnModule then
		return moduleName
	end
	
	-- pcall, because the data module may not exist.
	local success, data = pcall(mw.loadData, moduleName)
	
	if success then
--		log("success! ... " .. codepoint .. ": " .. tostring(data[codepoint]))
		return data[codepoint]
	end
	
--	log("failure: " .. codepoint .. " (" .. mw.ustring.char(codepoint) .. ")")
	return nil
end

--[[
	Retrieves the sortkey data for a single character.

	char:         a string (its first character is used) or a codepoint
	              number.
	returnModule: if truthy, return the name of the data module that covers
	              the character instead of the data itself.

	Returns the value from the data module (or the module name), or nil if
	the codepoint lies outside the two ranges handled by the data modules
	(13312-40938, i.e. U+3400-U+9FEA, and 131072-191456, i.e.
	U+20000-U+2EBE0) or no data is found.

	Raises an error if char is neither a non-empty string nor a number.
]]
function export.getData(char, returnModule)
	if type(char) == "string" then
		char = mw.ustring.codepoint(char)
		if not char then
			-- An empty string has no first codepoint.
			error("getData must operate on a single character or codepoint.")
		end
	elseif type(char) ~= "number" then
		--[[
			type() returns a string, so it must be compared with "number".
			Comparing with the undefined global `number` (nil) was always
			unequal, which rejected every numeric codepoint.
		]]
		error("getData must operate on a single character or codepoint.")
	end
	
--	log(char, mw.ustring.char(char))
	
	if char >= 13312 and char <= 40938 then
		return getFromModule(char, 13312, returnModule)
	elseif char >= 131072 and char <= 191456 then
		return getFromModule(char, 131072, returnModule)
	end
	
--	log("not in range: " .. char .. " (" .. mw.ustring.char(char) .. ")")
	return nil
end

--[[
	Builds a sortkey for text by replacing each character that has an entry
	in the sortkey data modules with that entry, and each valid ideographic
	description sequence (IDS) listed in
	Module:zh-sortkey/data/unsupported with its listed sortkey. Everything
	else is copied through unchanged.

	text: the string to generate a sortkey for.
	lang: optional language code; if given and not "zh", "vi" or "ja",
	      text is returned unchanged.
	sc:   optional script code; if given and not "Hani", text is returned
	      unchanged.

	Returns the sortkey as a string.
]]
function export.makeSortKey(text, lang, sc)
	local allowed_langs = {
		zh = true,
		vi = true,
		ja = true,
	}
	if lang and not allowed_langs[lang] then
		return text
	end
	
	if sc and sc ~= "Hani" then
		return text
	end
	
	local sort = {}
	
	-- The length does not change during the loop, so compute it only once.
	local length = mw.ustring.len(text)
	
	-- Data for unsupported IDSes; loaded only if an IDS is encountered.
	local unsupported
	
	local i = 1
	while i <= length do
		local character = substring(text, i, i)
		--[=[
			If we encounter an ideographic description character (IDC),
			find out if it begins a valid ideographic description sequence (IDS).
			
			If the IDS is valid and a sortkey for it is listed in
			[[Module:zh-sortkey/data/unsupported]], then add the
			sortkey, and move to the next character after the
			IDS.
			
			Otherwise, copy the IDC through and move to the next character
			after it.
			
			If the IDS is valid and no sortkey for it is found, track it.
		]=]
		if IDchars[character] then
			local j = findEndOfIDS(text, character, i)
			local data
			if j then
				local IDS = substring(text, i, j)
				unsupported = unsupported or mw.loadData("Module:zh-sortkey/data/unsupported")
				data = unsupported[IDS]
				if not data then
					require("Module:debug").track("zh-sortkey/IDS-without-sortkey")
					mw.log("ideographic description sequence without sortkey: " .. IDS)
				end
			end
			if data then
				table.insert(sort, data)
				-- Skip the rest of the IDS; the increment below moves past its last character.
				i = j
			else
				table.insert(sort, character)
			end
		else
			table.insert(sort, export.getData(character) or character)
		end
		i = i + 1
	end
	
	return table.concat(sort)
end

-- Return the table of public functions (getData, makeSortKey).
return export