Module:Unicode data/scripts/doc
This is the documentation page for Module:Unicode data/scripts
This Lua module is used on approximately 260,000 pages. To avoid large-scale disruption and unnecessary server load, any changes to it should first be tested in the module's /sandbox or /testcases subpages. The tested changes can then be added to this page in a single edit. Please consider discussing changes on the talk page before implementing them. |
Usage
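The data is extracted from Scripts.txt and PropertyValueAliases.txt in the Unicode Character Database by the two scripts shown below, which are written in Lua 5.3 and use LPeg. The first script (make_script_data.lua) reads Scripts.txt, loads the second script from name_to_code.lua in the same folder, and writes its result to data.lua; the second script reads PropertyValueAliases.txt and returns a table mapping script names to script codes. Both scripts must be in the same folder as the two data files, and you must have a global function named "sortedpairs" that can iterate over integer keys in a sparse table in numerical order. (The sortedPairs function in Module:table on English Wiktionary will work.)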
Lua 5.3 scripts |
---|
local lpeg = require 'lpeg'
-- Slurp Scripts.txt into a single string; the grammar further down is matched
-- against this string in one go. The handle is only needed while reading.
local scriptdata
do
	local source = assert(io.open('./Scripts.txt', 'rb'))
	scriptdata = assert(source:read 'a')
	source:close()
end
-- Path of the Lua data file that this script generates.
local outfile = './data.lua'
-- Table indexed by script name that yields the script code; it is whatever
-- name_to_code.lua returns when run.
local script_name_to_code = dofile './name_to_code.lua'
-- Publish every LPeg entry whose name starts with a character that equals its
-- own uppercase form (P, R, S, V, C, Cc, Cf, Cg, ...) into the environment, so
-- the grammar below can be written without the `lpeg.` prefix.
for name, value in pairs(lpeg) do
	local initial = name:sub(1, 1)
	local starts_uppercase = (initial == initial:upper())
	if starts_uppercase then
		_ENV[name] = value
	end
end
--- Format a number as uppercase hexadecimal, zero-padded to at least four digits.
-- @param number  the value to format
-- @return string such as "0041" or "1F600"
local function numtohex(number)
	local hex = string.format('%04X', number)
	return hex
end
--- Parse a string of hexadecimal digits into a number.
-- @param hex  string of hex digits (upper or lower case)
-- @return the parsed number, or nil when `hex` is not valid base-16 text
local function hextonum(hex)
	local value = tonumber(hex, 16)
	return value
end
-- Create table that contains arrays of codepoint ranges as well as
-- codepoint-to-script fields. Need separate tables for map and ranges:
--   t.ranges[script_code] = array of { first, last, script_code }
--   t.individual[codepoint] = script_code
-- State carried between successive calls to `process`:
local prev         -- last codepoint covered by the most recent data line
local prev_script  -- script code that `prev` was filed under
-- NOTE(review): nothing in this script as shown ever raises this above 0.
local maxscriptnamelen = 0

--- Fold one parsed data line into the accumulator table.
-- A line that directly continues the previous line of the same script
-- (its first codepoint is `prev + 1`) is merged into the preceding entry;
-- otherwise it is stored as a new range or as an individual codepoint.
-- @param t           accumulator with `ranges` and `individual` tables
-- @param cp1         first codepoint of the line, as a hex string
-- @param cp2         last codepoint as a hex string, or nil for a single codepoint
-- @param scriptname  script name; nil when the call carries no data line
-- @return t
-- Raises an error when `scriptname` has no entry in `script_name_to_code`.
local function process(t, cp1, cp2, scriptname, ...)
	-- Calls without a script name carry nothing to record.
	if not scriptname then return t end

	local script_code = script_name_to_code[scriptname]
	if not script_code then
		-- Without this check the failure would surface below as an opaque
		-- "table index is nil" error that does not name the offending script.
		error(('no script code found for script name %q'):format(scriptname))
	end

	-- Place in script-specific array initially.
	local rangearray = t.ranges[script_code]
	if not rangearray then
		rangearray = {}
		t.ranges[script_code] = rangearray
	end

	local cpnumber1 = hextonum(cp1)
	local cpnumber2 = cp2 and hextonum(cp2) or nil
	local last = cpnumber2 or cpnumber1

	-- Only merge with the previous line when it belongs to the same script;
	-- adjacency to another script's codepoint must not touch that script's data.
	local continues_previous = prev ~= nil
		and prev_script == script_code
		and cpnumber1 == prev + 1

	if continues_previous then
		if t.individual[prev] then
			-- Move individual codepoint into a newly created range.
			t.individual[prev] = nil
			rangearray[#rangearray + 1] = { prev, last, script_code }
		else
			-- Raise the top of the previous codepoint range.
			rangearray[#rangearray][2] = last
		end
	elseif cpnumber2 then
		rangearray[#rangearray + 1] = { cpnumber1, cpnumber2, script_code }
	else
		t.individual[cpnumber1] = script_code
	end

	prev, prev_script = last, script_code
	return t
end
-- LPeg grammar for the contents of Scripts.txt. Matching it against the whole
-- file folds every recognised data line into one table via `process`.
local patt = P {
-- Entry rule. Cf folds all captures with `process`; the constant table is the
-- first capture and so becomes the accumulator. The same table is captured
-- again before every line, and `process` returns the accumulator unchanged for
-- a capture that carries no script name. `+ 1` skips a single character
-- whenever no line can be matched at the current position.
Cf((Cc{ ranges = {}, individual = {} } * V 'patt' + 1)^1, process),
-- A newline followed by a data line or a comment. Because the newline is
-- required first, text before the first newline of the input is never parsed
-- as a line. Cg groups the data line's captures so they reach `process`
-- together in one call.
patt = V 'nl' * (Cg(V 'data_line') + V 'comment'),
-- cprange ; scriptname [count] # category <rest of line>
-- Captures: first codepoint, last codepoint (or nil), script name.
-- NOTE(review): the optional count is expected before '#'; if the data file
-- puts the bracketed count after the category it is simply consumed by
-- not_nl -- confirm against the data file.
data_line = V 'cprange' * V 'opts'
* P ';' * V 'opts' * C(V 'scriptname') * V 'opts'
* (V 'count' * V 'opts')^-1
* P '#' * V 'opts' * V 'category' * V 'opts'
* V 'not_nl',
-- '#' and everything up to the end of the line.
comment = P '#' * V 'not_nl',
-- One or more digits in square brackets.
count = '[' * R '09'^1 * ']',
-- An uppercase letter followed by a lowercase letter or '&'.
category = R 'AZ' * R('az', '&&'),
scriptname = R('AZ', 'az', '__')^1, -- Actually starts with capital and rest is alphabetic or underscore.
cprange = C(V 'cp') * (P '..' * C(V 'cp') + Cc(nil)), -- XXXX; XXXX..XXXX -> string, string; string, nil
-- Any run of characters that stops before a newline (may be empty).
not_nl = (1 - V 'nl')^0,
cp = V 'hex' * V 'hex' * V 'hex' * V 'hex' * V 'hex'^-1,-- maximum of 5 hex digits
hex = R('09', 'AF', 'af'), -- Actually just 0-9A-F.
opts = S ' \t'^0, -- Actually just space.
-- Line break: '\n' optionally preceded by '\r'.
nl = P '\r'^-1 * P '\n'
}
-- Run the grammar over the file contents; the result is the folded table with
-- its `ranges` and `individual` fields.
local scriptdatatable = patt:match(scriptdata)
-- Collapse the per-script range arrays into one flat array, then order that
-- array by the first codepoint of each range.
local flattened = {}
for _, script_ranges in pairs(scriptdatatable.ranges) do
	for index = 1, #script_ranges do
		flattened[#flattened + 1] = script_ranges[index]
	end
end
scriptdatatable.ranges = flattened

-- Strict ordering on the first codepoint of a range.
local function starts_earlier(range1, range2)
	return range1[1] < range2[1]
end
table.sort(scriptdatatable.ranges, starts_earlier)
-- P R I N T R E S U L T
-- Write the generated data file: a Lua chunk returning a table with an
-- `individual` map (codepoint -> script code) and a `ranges` array of
-- { first, last, script code } entries.
local out = assert(io.open(outfile, 'wb'))
out:write [[
-- Generated by make_script_data.lua.
return {
individual = {
]]
-- sortedpairs is expected to be a global that visits the integer keys of a
-- sparse table in numerical order.
for cp, script in sortedpairs(scriptdatatable.individual) do
	out:write(('\t\t[0x%05X] = "%s",\n'):format(cp, script))
end
out:write[[
},
ranges = {
]]
-- The script code is quoted with %q directly and no column padding is applied.
-- `maxscriptnamelen` is still 0 at this point, so a padded conversion built
-- from it would be '%-0s'; the '0' flag is not defined for string conversions
-- and recent Lua 5.4 releases reject that format outright.
local range_row = '\t\t{ 0x%05X, 0x%05X, %q },\n'
for _, range in ipairs(scriptdatatable.ranges) do
	out:write(range_row:format(range[1], range[2], range[3]))
end
out:write [[
},
}
]]
-- Closing flushes the file; a failure here means the output is incomplete.
assert(out:close())
local lpeg = require 'lpeg'
-- Read the whole of PropertyValueAliases.txt into memory. The handle is closed
-- as soon as the contents have been read, and a failed read stops the script
-- here rather than passing nil on to the grammar.
local property_value_aliases_filename = "./PropertyValueAliases.txt"
local property_value_aliases
do
	local handle = assert(io.open(property_value_aliases_filename, 'rb'))
	property_value_aliases = assert(handle:read('a'))
	handle:close()
end
-- Publish every LPeg entry whose name starts with a character that equals its
-- own uppercase form (P, R, S, V, C, ...) into the environment, so the grammar
-- below can be written without the `lpeg.` prefix.
for name, value in pairs(lpeg) do
	local initial = name:sub(1, 1)
	local starts_uppercase = (initial == initial:upper())
	if starts_uppercase then
		_ENV[name] = value
	end
end
-- Result table: script name -> script code.
local script_name_to_code = {}

--- Record one code/name pair in `script_name_to_code`.
-- Note the argument order: the code comes first, the name second, while the
-- table is keyed by name.
-- @param code  script code to store as the value
-- @param name  script name to use as the key
local add_to_table = function(code, name)
	script_name_to_code[name] = code
end
-- LPeg grammar that scans the file contents for "sc" lines and records each
-- code/name pair through add_to_table.
local patt = P {
-- Entry rule: at each position either match a script line, handing its two
-- captures (code, name) to add_to_table, or skip a single character.
(V 'script_line' / add_to_table + 1)^1,
-- newline, "sc", ';', captured code, ';', captured name, then the rest of the
-- line. The leading newline is required, so text before the first newline of
-- the input is never treated as a script line.
script_line = V 'nl' * P 'sc' * V 'sep' * C(V 'code') * V 'sep' * C(V 'name') * (P(1) - V 'nl')^0,
-- One uppercase letter followed by exactly three lowercase letters.
code = R 'AZ' * V 'lower' * V 'lower' * V 'lower',
-- One or more letters or underscores.
name = R('AZ', 'az', '__')^1,
lower = R 'az',
-- A semicolon with optional blanks on either side.
sep = V 'w' * P ';' * V 'w',
-- Optional run of spaces and tabs.
w = S ' \t'^0,
-- Line break: '\n' optionally preceded by '\r'.
nl = P '\r'^-1 * P '\n'
}
-- Matching is done for its side effect of filling script_name_to_code.
patt:match(property_value_aliases)
-- The table is the value of this chunk, i.e. what a caller using dofile receives.
return script_name_to_code
|