Jump to content

Module:Lang/data/make is latn data

From Wikipedia, the free encyclopedia

require ('strict');

-- Locate the documentation subpage that carries the local copy of the Unicode
-- ScriptExtensions text.  When the current page is not already a '/doc' page,
-- the title object is swapped for '<current page>/doc'.
local title_object = mw.title.getCurrentTitle ();								-- title object for the current page
local viewing_doc_page = title_object.fullText:find ('/doc$');					-- non-nil when the current page name ends with '/doc'
if not viewing_doc_page then
	title_object = mw.title.new (title_object.fullText .. '/doc');				-- current page is not the doc page; point at its ~/doc subpage instead
end
local content = title_object:getContent();										-- raw text of the doc page; read by extension_codepoints_get()

-- accumulators filled by the *_codepoints_get() functions and consumed by main()
-- singles tables are keyed by codepoint (value true); ranges tables are sequences of {low, high} pairs
local common_scripts_singles_t, common_scripts_ranges_t = {}, {};				-- 'Zyyy' (common) codepoints
local latn_scripts_singles_t, latn_scripts_ranges_t = {}, {};					-- 'Latn' codepoints
local extension_scripts_singles_t, extension_scripts_ranges_t = {}, {};			-- 'Latn' codepoints from the script-extensions text


--[[--------------------------< Z Y Y Y _ L A T N _ C O D E P O I N T S _ G E T >-----------------------------

Load Module:Unicode data/scripts and copy every entry tagged 'Latn' or 'Zyyy' (common) into the matching
module-level accumulator:
	~/scripts.singles	– codepoint-keyed entries; stored as <singles table>[codepoint] = true
	~/scripts.ranges	– sequence of {low, high, script} entries; stored as {low, high} appended to <ranges table>

Entries for any other script are ignored.  Returns nothing.

]]

local function zyyy_latn_codepoints_get ()
	local scripts_data = mw.loadData ('Module:Unicode data/scripts');

	local singles_for_script_t = {												-- script code -> destination table for single codepoints
		Latn = latn_scripts_singles_t,
		Zyyy = common_scripts_singles_t,
		};
	local ranges_for_script_t = {												-- script code -> destination table for codepoint ranges
		Latn = latn_scripts_ranges_t,
		Zyyy = common_scripts_ranges_t,
		};

	for codepoint, script_code in pairs (scripts_data.singles) do
		local destination_t = singles_for_script_t[script_code];
		if destination_t then
			destination_t[codepoint] = true;									-- keyed by codepoint (not a sequence) so duplicates collapse
		end
	end

	for _, entry_t in ipairs (scripts_data.ranges) do
		local destination_t = ranges_for_script_t[entry_t[3]];					-- entry_t[3] is the script code
		if destination_t then
			table.insert (destination_t, {entry_t[1], entry_t[2]});				-- keep only the low and high codepoints
		end
	end
end


--[[--------------------------< E X T E N S I O N _ C O D E P O I N T S _ G E T >-----------------------------

read a local copy of the current unicode ScriptExtensions-xx.x.x.txt file (hidden in this module's doc page,
available here as <content>).  Extract latn-script codepoints and ranges.  Convert codepoints from hex to
decimal (same format as codepoints extracted from Unicode data/scripts).

Data lines are expected to look like one of:
	1CF7          ; Beng Latn # Mc       VEDIC SIGN ATIKRAMA
	1CD0..1CD2    ; Beng Latn # Mn   [3] VEDIC TONE KARSHANA..VEDIC TONE PRENKHA

Single codepoints are stored as extension_scripts_singles_t[codepoint] = true; ranges are appended to
extension_scripts_ranges_t as {low, high}.

Both patterns are anchored to the start of the extracted line and the range form is tried first.  Without that,
the single-codepoint pattern also matches the second endpoint of a range line so that the range's high value is
recorded a second time as a single.

Raises an error when the doc page content could not be read (<content> is nil).

]]

local function extension_codepoints_get ()
	if not content then
		error ('script extensions text not available: unable to read content of doc page');
	end

	local line_pattern = '%x+[^\r\n]+';											-- from the first hex digit on a line to the end of that line
	local range_pattern = '^(%x+)%.%.(%x+)%s*;[^#]*Latn[^#]*#%s*%a%a%s*%[%d+%]%s*.+';
	local single_pattern = '^(%x+)%s*;[^#]*Latn[^#]*#%s*%a%a%s*.+';

	for line in content:gmatch (line_pattern) do								-- read each line of extensions text file
		local range_low, range_high = line:match (range_pattern);
		if range_low then
			table.insert (extension_scripts_ranges_t, {							-- convert hex to decimal and save
				tonumber (range_low, 16),
				tonumber (range_high, 16),
				});
		else																	-- not a range line; is it a single-codepoint line?
			local single = line:match (single_pattern);
			if single then
				extension_scripts_singles_t[tonumber (single, 16)] = true;		-- convert hex to decimal and save
			end
		end
	end
end


--[[--------------------------< B I N A R Y _ S E A R C H >---------------------------------------------------

Determine whether <target> lies inside any of the inclusive ranges in <ranges_t>.

<target>	– codepoint (number) to look for
<ranges_t>	– sequence of {low, high} pairs; must be sorted ascending by low value and must not overlap,
			  otherwise a containing range may be missed

Returns true when some range satisfies low <= target <= high; false otherwise (including when <ranges_t>
is empty).

]]

local function binary_search (target, ranges_t)
	local idx_bot = 1;															-- index of first candidate range
	local idx_top = #ranges_t;													-- index of last candidate range

	while idx_bot <= idx_top do													-- loop ends when there are no candidates left
		local idx_mid = math.floor ((idx_bot + idx_top) / 2);					-- get the mid-point of the remaining candidates
		local range_t = ranges_t[idx_mid];

		if target < range_t[1] then												-- <target> below this range; discard this range and all above it
			idx_top = idx_mid - 1;
		elseif target > range_t[2] then											-- <target> above this range; discard this range and all below it
			idx_bot = idx_mid + 1;
		else																	-- low value <= target <= high value
			return true;
		end
	end

	return false;																-- no range holds <target>
end


--[[--------------------------< E X P A N D _ R A N G E >-----------------------------------------------------

Append every value from <range_t>[1] through <range_t>[2] (inclusive) to the end of sequence <out_t>:
	{10, 15} -> 10, 11, 12, 13, 14, 15

Nothing is appended when <range_t>[1] is greater than <range_t>[2].  <out_t> is modified in place; returns nothing.

]]

local function expand_range (range_t, out_t)
	local first, last = range_t[1], range_t[2];
	for codepoint = first, last do
		out_t[#out_t + 1] = codepoint;
	end
end


--[[--------------------------< M A K E _ R A N G E S _ F R O M _ S I N G L E S >------------------------------

search <scripts_singles_t> for runs of two or more contiguous codepoints and append each run to
<ranges_from_singles_t> as {bottom, top}.

<scripts_singles_t>		– table keyed by codepoint (values are ignored); not modified
<ranges_from_singles_t>	– sequence that receives the extracted {bottom, top} ranges, in ascending order

Each extracted range is also written to the debug log.  Singles that end up inside a range are not removed
here; main() drops them from the singles list when it builds its output.

]]

local function make_ranges_from_singles (scripts_singles_t, ranges_from_singles_t)
	local singles_t = {};														-- sequence of singles suitable for sorting
	for k, _ in pairs (scripts_singles_t) do
		singles_t[#singles_t + 1] = k;											-- add codepoint to singles_t
	end

	table.sort (singles_t);														-- ascending sort to get them all in increasing order

	local i = 1;																-- indexer; walks <singles_t> once without modifying it
	local count = #singles_t;
	while i <= count do
		local bottom = singles_t[i];											-- candidate range bottom
		local top = bottom;

		while singles_t[i + 1] == (top + 1) do									-- extend the run while the next single is contiguous
			i = i + 1;
			top = singles_t[i];
		end

		if top > bottom then													-- a run of at least two codepoints is a range
			mw.log (string.format ('%s–%s (%.4X..%.4X) extracted from singles_t', bottom, top, bottom, top));
			table.insert (ranges_from_singles_t, {bottom, top});				-- save the extracted range
		end

		i = i + 1;																-- first single after the run (or after the lone single)
	end
end


--[[--------------------------< M E R G E _ S O R T E D _ R A N G E S >---------------------------------------

helper for main().  <sorted_ranges_t> is a sequence of {low, high} ranges sorted ascending by low value.  Returns
a new sequence in which:
	ranges that are wholly inside an earlier range are dropped
	ranges that partially overlap an earlier range are merged with it
	contiguous ranges (high + 1 == next low) are joined

Each change is written to the debug log.  <sorted_ranges_t> is not modified.

]]

local function merge_sorted_ranges (sorted_ranges_t)
	local merged_t = {};

	for _, range_t in ipairs (sorted_ranges_t) do
		local last_t = merged_t[#merged_t];										-- most recently kept range; nil on first pass

		if last_t and (range_t[1] <= last_t[2]) then							-- starts inside the kept range
			if range_t[2] <= last_t[2] then										-- and ends inside it too: a subset
				mw.log ('removed subrange ' .. range_t[1] .. '–' .. range_t[2] .. string.format (' (%x..%x) ', range_t[1], range_t[2]));
			else																-- ends beyond the kept range: extend the kept range
				mw.log (string.format ('merged overlapping: %s..%s and %s..%s', last_t[1], last_t[2], range_t[1], range_t[2]));
				last_t[2] = range_t[2];
			end
		elseif last_t and ((last_t[2] + 1) == range_t[1]) then					-- example: if {0, 64+1} == {65, 90} then join
			mw.log (string.format ('joined: %s..%s and %s..%s', last_t[1], last_t[2], range_t[1], range_t[2]));
			last_t[2] = range_t[2];												-- join
		else
			table.insert (merged_t, {range_t[1], range_t[2]});					-- separate range; keep a copy
		end
	end

	return merged_t;
end


--[[--------------------------< F O R M A T _ L I N E >-------------------------------------------------------

helper for main().  Returns one output line: a tab, <entry_str>, tab padding computed from the length of
<entry_str>, and a '-- <comment_str>' trailer.

]]

local function format_line (entry_str, comment_str)
	local rep = math.ceil ((80 - (4 + entry_str:len())) / 4);					-- number of padding tabs ahead of the comment
	return string.format ('\t%s%s-- %s', entry_str, string.rep ('\t', rep), comment_str);
end


--[[--------------------------< R E N D E R _ S I N G L E S >-------------------------------------------------

helper for main().  Renders sequence <codepoints_t> as lua source for a 'singles_t' table wrapped in
<syntaxhighlight> tags; one '[codepoint] = true,' line per codepoint with the hex value as a comment.

]]

local function render_singles (codepoints_t)
	local lines_t = {'<syntaxhighlight lang="lua">local singles_t = {'};		-- opening stuff
	for _, codepoint in ipairs (codepoints_t) do
		lines_t[#lines_t + 1] = format_line (string.format ('[%s] = true,', codepoint), string.format ('%.4X', codepoint));
	end
	lines_t[#lines_t + 1] = '\t}</syntaxhighlight>';							-- to close the table
	return table.concat (lines_t, '\n');
end


--[[--------------------------< R E N D E R _ R A N G E S >---------------------------------------------------

helper for main().  Renders sequence <ranges_t> of {low, high} ranges as lua source for a 'ranges_t' table
wrapped in <syntaxhighlight> tags; one '{low, high},' line per range with the hex values as a comment.

]]

local function render_ranges (ranges_t)
	local lines_t = {'<syntaxhighlight lang="lua">local ranges_t = {'};			-- opening stuff
	for _, range_t in ipairs (ranges_t) do
		lines_t[#lines_t + 1] = format_line (string.format ('{%s, %s},', range_t[1], range_t[2]), string.format ('%.4X..%.4X', range_t[1], range_t[2]));
	end
	lines_t[#lines_t + 1] = '\t}</syntaxhighlight>';							-- to close the table
	return table.concat (lines_t, '\n');
end


--[[--------------------------< M A I N >---------------------------------------------------------------------

{{#invoke:Sandbox/trappist the monk/is latn|main}}

build composite lists (singles and ranges) of common- and latn-script codepoints.

Duplicate singles and duplicate ranges are swallowed.

Runs of contiguous singles are converted to ranges (see make_ranges_from_singles()).

When a range has a different length from another range with the same starting point, this function takes the
longest range.

When a range is a subset of a larger range, the subset range is removed from the list.

When a range starts inside one range and ends beyond it, the two are merged into a single range.

Contiguous ranges (ending codepoint of one range is one less than the starting codepoint of the next range) are
joined to make a single range.

Singles that fall inside any range are omitted from the singles list.

Expands all ranges into singles and combines with separately defined singles to create one long list of singles
because why not?

Finally the lists are made all pretty-like and rendered for copy pasta into an appropriate data module for use
by Module:Lang.  Returns the preprocessed wikitext holding, in order: the singles list, the ranges list, and the
expanded singles list.

]]

local function main (frame)
	zyyy_latn_codepoints_get();													-- get common- and latn-script codepoints from [[Module:Unicode data/scripts]]
	extension_codepoints_get();													-- get latn-script codepoints from local copy of unicode scripts text file

	local scripts_singles_t = {};
	for _, scripts_t in ipairs ({latn_scripts_singles_t, common_scripts_singles_t, extension_scripts_singles_t}) do
		for k, v in pairs (scripts_t) do
			scripts_singles_t[k] = v;											-- duplicates (if any) are swallowed
		end
	end

	local ranges_from_singles_t = {};											-- a sequence of sequences
	make_ranges_from_singles (scripts_singles_t, ranges_from_singles_t);		-- add contiguous singles in <scripts_singles_t> to <ranges_from_singles_t>

	local temp_t = {};															-- for ranges; <k> is range low value, <v> is range high value
	for _, ranges_t in ipairs ({latn_scripts_ranges_t, common_scripts_ranges_t, extension_scripts_ranges_t, ranges_from_singles_t}) do
		for _, range_t in ipairs (ranges_t) do
			local low, high = range_t[1], range_t[2];
			local known_high = temp_t[low];										-- high value already recorded for this low value (if any)
			if known_high then
				if known_high ~= high then
					mw.log (low .. '–' .. high .. string.format (' (%x..%x) ', low, high) .. 'does not match: ' .. known_high .. string.format (' (%x)', known_high));
					high = math.max (high, known_high);							-- use the greater high value
				else
					mw.log (low .. '–' .. high .. string.format (' (%x..%x) ', low, high) .. ' is duplicate');
				end
			end
			temp_t[low] = high;													-- add to temp table
		end
	end

	local scripts_ranges_t = {};
	for k, v in pairs (temp_t) do												-- make a sequence of codepoint range sequences
		table.insert (scripts_ranges_t, {k, v});
	end
	table.sort (scripts_ranges_t, function (a_t, b_t)							-- ascending sort by range low value
		return a_t[1] < b_t[1];
	end);

	scripts_ranges_t = merge_sorted_ranges (scripts_ranges_t);					-- drop subsets, merge overlaps, join contiguous; must be sorted first

	local has_ranges = #scripts_ranges_t > 0;									-- don't search an empty ranges list
	local singles_out_t = {};													-- sequence to hold singles that are not in any range
	local expanded_out_t = {};													-- sequence to hold singles + expanded ranges
	for k, _ in pairs (scripts_singles_t) do
		if has_ranges and binary_search (k, scripts_ranges_t) then				-- omit singles that are included in a range
			mw.log (string.format ('removed: %s (%X)', k, k));
		else
			table.insert (singles_out_t, k);
			table.insert (expanded_out_t, k);
		end
	end
	table.sort (singles_out_t);

	for _, v_t in ipairs (scripts_ranges_t) do
		expand_range (v_t, expanded_out_t);										-- expand this range into <expanded_out_t>
	end
	table.sort (expanded_out_t);

	return frame:preprocess (table.concat ({									-- make a big string and done
		render_singles (singles_out_t),
		'\n\n',
		render_ranges (scripts_ranges_t),
		'\n\n',
		render_singles (expanded_out_t),
		}));
end


--[[--------------------------< E X P O R T S >---------------------------------------------------------------
]]

return {
	main = main,
	}