Module:Unicode data/scripts/doc

From Nordic Larp Wiki
< Module:Unicode data‎ | scripts
Revision as of 03:14, 29 December 2018 by Johannes Axner (talk | contribs) (1 revision imported)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

This is the documentation page for Module:Unicode data/scripts

Usage

Extracted from Scripts.txt and PropertyValueAliases.txt in the Unicode Character Database using the two scripts shown below, which are written in Lua 5.3 and use LPeg. The scripts must be in the same folder as the two data files, and a global function named "sortedpairs" must be defined that iterates over the integer keys of a sparse table in numerical order. (The sortedPairs function in Module:table on English Wiktionary will work.)

Lua 5.3 scripts
make_script_data.lua
local lpeg = require 'lpeg'
-- Read the complete contents of Scripts.txt into memory, then release the handle.
local infilehandle = assert(io.open('./Scripts.txt', 'rb'))
local scriptdata = assert(infilehandle:read('a'))
infilehandle:close()

-- Path of the Lua data file that this script generates.
local outfile = './data.lua'

-- Value returned by running name_to_code.lua; indexed by script name below.
local script_name_to_code = dofile('./name_to_code.lua')

-- Copy every lpeg entry whose name begins with a character that upper()
-- leaves unchanged (P, R, S, V, C, Cc, ...) into _ENV, so the grammar below
-- can use those constructors without the "lpeg." prefix.
for name, value in pairs(lpeg) do
	local initial = name:sub(1, 1)
	if initial == initial:upper() then
		_ENV[name] = value
	end
end

-- Format a number as uppercase hexadecimal, zero-padded to at least four digits.
local function numtohex(number)
	local hex = string.format('%04X', number)
	return hex
end

-- Parse a string of hexadecimal digits into a number.
-- Returns nil when the string is not valid base-16.
local function hextonum(hex)
	local value = tonumber(hex, 16)
	return value
end

-- Create table that contains arrays of codepoint ranges as well as codepoint-to-script fields.
-- Need separate tables for map and ranges.

-- Last code point covered by the most recently processed data line.
local prev
-- Script code of the most recently processed data line.
local prev_script
-- Declared here but never updated by this function.
local maxscriptnamelen = 0

-- Fold step: record one data line in the accumulator `t`.
--
-- t          accumulator with two fields:
--              t.ranges[script_code] = array of { first, last, script_code }
--              t.individual[codepoint] = script_code
-- cp1        first code point of the line, as a hexadecimal string
-- cp2        last code point as a hexadecimal string, or nil for a
--            single-code-point line
-- scriptname script name looked up in script_name_to_code; when it is
--            absent the call is a no-op
--
-- A line that starts directly after the previous line of the SAME script is
-- merged with it, so runs of adjacent code points end up as a single range.
-- Returns `t`.  Raises an error if the script name has no known code.
local function process(t, cp1, cp2, scriptname, ...)
	if not scriptname then return t end

	local script_code = script_name_to_code[scriptname]
	if not script_code then
		-- Fail with a readable message instead of "table index is nil".
		error(('no script code known for script name %q'):format(tostring(scriptname)))
	end

	local rangearray = t.ranges[script_code] -- Place in script-specific array initially.
	if not rangearray then
		rangearray = {}
		t.ranges[script_code] = rangearray
	end

	local first = hextonum(cp1)
	local last = cp2 and hextonum(cp2) -- nil for a single code point
	local top = last or first

	-- Only merge with the previous line when it belonged to the same script;
	-- otherwise an adjacent code point of another script would be absorbed.
	if prev and prev_script == script_code and first == prev + 1 then
		if t.individual[prev] then
			-- The previous line was a lone code point: turn it into a range.
			t.individual[prev] = nil
			rangearray[#rangearray + 1] = { prev, top, script_code }
		else
			-- The previous line already ended a range: raise its upper bound.
			rangearray[#rangearray][2] = top
		end
	elseif last then
		rangearray[#rangearray + 1] = { first, last, script_code }
	else
		t.individual[first] = script_code
	end

	prev, prev_script = top, script_code

	return t
end

-- LPeg grammar that scans the text of Scripts.txt.
-- The start rule folds (Cf) every capture with `process`.  The constant
-- table from Cc is the first capture and so becomes the accumulator; each
-- grouped data line (Cg) is then passed on as
-- process(accumulator, cp1, cp2, scriptname).  Because the Cc sits inside
-- the repetition it is captured again on later iterations, which results in
-- a `process` call that has no script name.
-- Any character that does not start a recognised line is consumed by `+ 1`.
local patt = P {
	Cf((Cc{ ranges = {}, individual = {} } * V 'patt' + 1)^1, process),
	-- A line is only recognised immediately after a newline; it is either a
	-- data line (captured as a group) or a comment (no captures).
	patt       =  V 'nl'        * (Cg(V 'data_line') + V 'comment'),
	-- <cprange> ; <scriptname> [optional count] # <category> <rest of line>
	data_line  =  V 'cprange'   *  V 'opts'
		       *  P ';'         *  V 'opts'      * C(V 'scriptname') * V 'opts'
			   * (V 'count'     *  V 'opts')^-1
		       *  P '#'         *  V 'opts'      * V 'category'      * V 'opts'
			   *  V 'not_nl',
	-- '#' followed by everything up to the end of the line.
	comment    =  P '#'         *  V 'not_nl',
	
	-- One or more decimal digits in square brackets.
	count      = '[' * R '09'^1 * ']',
	-- An uppercase letter followed by a lowercase letter or '&'.
	category   = R 'AZ' * R('az', '&&'),
	scriptname = R('AZ', 'az', '__')^1, -- Actually starts with capital and rest is alphabetic or underscore.
	cprange    = C(V 'cp') * (P '..' * C(V 'cp') + Cc(nil)), -- XXXX; XXXX..XXXX -> string, string; string, nil
	
	-- Zero or more characters, stopping before the next newline.
	not_nl     = (1 - V 'nl')^0,
	cp         = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-1,-- maximum of 5 hex digits
	hex        = R('09', 'AF', 'af'), -- Actually just 0-9A-F.
	opts       = S ' \t'^0, -- Actually just space.
	-- "\n", optionally preceded by "\r".
	nl         = P '\r'^-1 * P '\n'
}

-- Run the grammar over the file contents; the result is the table built by
-- the fold, with `ranges` keyed by script code and `individual` keyed by
-- code point.
local scriptdatatable = patt:match(scriptdata)

-- Flatten the per-script range arrays into one combined array.
local new_ranges = {}
for _, script_ranges in pairs(scriptdatatable.ranges) do
	for index = 1, #script_ranges do
		new_ranges[#new_ranges + 1] = script_ranges[index]
	end
end
scriptdatatable.ranges = new_ranges

-- Order the combined array by the first code point of each range.
table.sort(new_ranges, function (a, b)
	return a[1] < b[1]
end)



-- P R I N T   R E S U L T

-- Check for the externally supplied iterator before opening (and thereby
-- truncating) the output file, so a missing helper does not leave a
-- half-written data file behind.
assert(sortedpairs,
	'a global function "sortedpairs" that iterates integer keys in numerical order is required')

local out = assert(io.open(outfile, 'wb'))

out:write [[
-- Generated by make_script_data.lua.

return {
	individual = {
]]

-- Single code points, written in the order produced by sortedpairs.
for cp, script in sortedpairs(scriptdatatable.individual) do
	out:write(('\t\t[0x%05X] = "%s",\n'):format(cp, script))
end

out:write[[
	},
	
	ranges = {
]]

-- Quote every script code once and measure the widest one, so the third
-- column is padded to a common width computed from the data actually written.
local quoted_codes = {}
local code_width = 0
for index, range in ipairs(scriptdatatable.ranges) do
	local quoted = ('%q'):format(range[3])
	quoted_codes[index] = quoted
	code_width = math.max(code_width, #quoted)
end
-- string.format accepts at most two width digits.
code_width = math.min(code_width, 99)

local range_format = '\t\t{ 0x%05X, 0x%05X, %-' .. code_width .. 's },\n'
for index, range in ipairs(scriptdatatable.ranges) do
	out:write(range_format:format(range[1], range[2], quoted_codes[index]))
end

out:write [[
	},
}
]]

-- close() reports errors from flushing buffered output.
assert(out:close())
name_to_code.lua
local lpeg = require 'lpeg'

-- Load the whole of PropertyValueAliases.txt into memory.
local property_value_aliases_filename = "./PropertyValueAliases.txt"
local property_value_aliases
do
	local infile = assert(io.open(property_value_aliases_filename, 'rb'))
	-- Check the read result and close the handle explicitly rather than
	-- leaving it open until garbage collection.
	property_value_aliases = assert(infile:read('a'))
	infile:close()
end

-- Expose the lpeg entries whose names start with a character that upper()
-- leaves unchanged (P, R, S, V, C, ...) through _ENV, so the grammar can
-- refer to them without the "lpeg." prefix.
for name, value in pairs(lpeg) do
	local initial = name:sub(1, 1)
	if initial == initial:upper() then
		_ENV[name] = value
	end
end

-- Result table: script name -> script code.
local script_name_to_code = {}

-- Match-time callback: store `code` under the key `name`.
-- A later call with the same name overwrites the earlier entry.
local add_to_table = function(code, name)
	script_name_to_code[name] = code
end

-- Grammar that scans PropertyValueAliases.txt for lines of the form
--   sc ; <code> ; <name> ...
-- and hands each captured (code, name) pair to add_to_table.
local patt = P {
	'scan',
	-- Try a script line at every position; otherwise skip one character.
	scan        = (V 'script_line' / add_to_table + P(1))^1,
	-- Only recognised directly after a newline; the rest of the line is ignored.
	script_line = V 'nl' * P 'sc' * V 'sep'
	            * C(V 'code') * V 'sep'
	            * C(V 'name')
	            * (P(1) - V 'nl')^0,
	-- One uppercase letter followed by exactly three lowercase letters.
	code        = R 'AZ' * V 'lower' * V 'lower' * V 'lower',
	name        = R('AZ', 'az', '__')^1,
	lower       = R 'az',
	-- A semicolon with optional spaces or tabs on either side.
	sep         = V 'w' * P ';' * V 'w',
	w           = S ' \t'^0,
	nl          = P '\r'^-1 * P '\n',
}

-- Matching is done purely for its side effect of filling script_name_to_code.
patt:match(property_value_aliases)

return script_name_to_code