#!/usr/bin/lua

-- Split the Unicode Character Database and transform it into SQL
-- requests to populate a database.
-- http://www.unicode.org/Public/UNIDATA/UCD.html
--
-- Usage: <this script> UCD-directory-name
-- The SQL is written to standard output, wrapped in BEGIN; ... COMMIT;
--
-- To get the required files:
-- wget -r http://www.unicode.org/Public/UNIDATA/
--
-- TODO:
-- * SpecialCasing (partly done with CaseFolding)

-- Lua file system https://keplerproject.github.io/luafilesystem/
local lfs = require "lfs"

-- Set to true to dump the parsed UnicodeData fields on standard error.
-- Kept in a local: a global named "debug" would hide Lua's debug library.
local DEBUG = false

---------------------------------------------------------------------------
-- Generic helpers
---------------------------------------------------------------------------

-- Backslash-escape special characters for SQL: double quote, single quote
-- and backslash get a leading backslash, NUL bytes become the two
-- characters \0.
-- NOTE(review): backslash escaping is only honoured by some SQL engines or
-- settings -- confirm it matches the target database.
function addslashes(s)
  s = string.gsub(s, "(['\"\\])", "\\%1")
  -- %c matches NUL in every Lua version (the %z class is deprecated);
  -- returning nil from the callback leaves other control characters as is.
  return (string.gsub(s, "%c", function(c)
    if c == "\0" then
      return "\\0"
    end
  end))
end -- addslashes

-- Replace every underscore by a space.
function delunderscores(s)
  s = string.gsub(s, "_", " ")
  return s
end -- delunderscores

-- Remove leading and trailing whitespace.
-- http://www.lua.org/pil/20.3.html
function trim(s)
  return (string.gsub(s, "^%s*(.-)%s*$", "%1"))
end

-- Return the path of the first entry of `dir` (in the order lfs.dir yields
-- them) whose name starts with `beginning`. Raises an error if none does.
function findfile(dir, beginning)
  local length = string.len(beginning)
  for file in lfs.dir(dir) do
    local filename = tostring(file)
    if string.sub(filename, 1, length) == beginning then
      return (dir .. "/" .. filename)
    end
  end
  error(string.format("Cannot find %s in directory %s", beginning, dir))
end

-- Print one formatted SQL statement (or SQL comment) on standard output.
local function emit(fmt, ...)
  print(string.format(fmt, ...))
end

-- Return the list of numbers for all runs of hexadecimal digits in `text`.
local function parse_hex_list(text)
  local values = {}
  for value in string.gmatch(text, "%x+") do
    values[#values + 1] = tonumber(value, 16)
  end
  return values
end

-- Split `line` on semicolons, keeping empty fields, so that field numbers
-- stay aligned whatever characters the fields contain.
local function split_fields(line)
  local fields = {}
  for field in string.gmatch(line .. ";", "([^;]*);") do
    fields[#fields + 1] = field
  end
  return fields
end

-- Parse one line of a "field ; field # comment" data file.
-- Returns the list of trimmed fields, or nil for blank/comment-only lines.
local function parse_data_line(line)
  local data = trim((string.gsub(line, "#.*$", "")))
  if data == "" then
    return nil
  end
  local fields = split_fields(data)
  for i = 1, #fields do
    fields[i] = trim(fields[i])
  end
  return fields
end

-- Parse "XXXX" or "XXXX..YYYY" (hexadecimal code points).
-- Returns first, last, is_range; first == last for a single code point.
-- Raises an error mentioning `lineno` when a code point cannot be parsed.
local function parse_range(text, lineno)
  text = tostring(text)
  local start, stop = string.find(text, "..", 1, true)
  if not start then
    local codepoint = tonumber(text, 16)
    if not codepoint then
      error(string.format("Cannot convert \"%s\" at line %i to a code point",
                          text, lineno))
    end
    return codepoint, codepoint, false
  end
  local first_text = string.sub(text, 1, start - 1)
  local first = tonumber(first_text, 16)
  if not first then
    error(string.format(
      "Cannot convert \"%s\" at line %i to a (starting) code point",
      first_text, lineno))
  end
  local last_text = string.sub(text, stop + 1)
  local last = tonumber(last_text, 16)
  if not last then
    error(string.format(
      "Cannot convert \"%s\" at line %i to a (ending) code point",
      last_text, lineno))
  end
  return first, last, true
end

-- Key used to compare block names loosely: case, whitespace, hyphens and
-- underscores are ignored, so "Latin_1_Supplement" and "Latin-1 Supplement"
-- give the same key.
local function loose_key(name)
  return string.lower((string.gsub(name, "[%s_%-]", "")))
end

---------------------------------------------------------------------------
-- Field parsers
---------------------------------------------------------------------------

-- Parse the decomposition field of UnicodeData.
-- See
-- http://www.unicode.org/Public/UNIDATA/UCD.html#Character_Decomposition_Mappings
-- "Where no such tag is given, the mapping is canonical. Conversely,
-- the presence of a formatting tag also indicates that the mapping is
-- a compatibility mapping and not a canonical mapping." Examples:
--   <compat> 004C 00B7
--   004C 030C
--   <super> 0061
-- Returns the tag (without angle brackets, nil when absent) and the list
-- of code points as numbers.
-- Warning, string functions in Lua do not operate on regexps but on
-- much simpler patterns. The patterns used here are lax.
function parse_decomposition(db_entry)
  -- %a, not %l: the <noBreak> tag contains an upper-case letter, and an
  -- unrecognised tag would have its letters a-f/A-F read as code points.
  local _, end_data, class = string.find(db_entry, "<(%a+)>")
  if end_data ~= nil then
    db_entry = string.sub(db_entry, end_data + 1)
  end
  return class, parse_hex_list(db_entry)
end

-- Parse the mapping field of CaseFolding: a space-separated list of
-- hexadecimal code points, returned as a list of numbers.
function parse_casefolding(db_entry)
  return parse_hex_list(db_entry)
end

---------------------------------------------------------------------------
-- One emitter per UCD file
---------------------------------------------------------------------------

-- loose_key(block name) -> name stored in the Blocks table; filled while
-- reading PropertyValueAliases, used when assigning characters to blocks.
local block_names = {}

-- Categories, Bidi classes, Blocks (PropertyValueAliases).
local function emit_property_values(path)
  for line in io.lines(path) do
    local fields = parse_data_line(line)
    if fields and fields[3] then
      local kind, short, long = fields[1], fields[2], fields[3]
      if kind == "gc" then -- Category
        emit("INSERT INTO Categories (name, description) VALUES ('%s', '%s');",
             addslashes(short), addslashes(long))
      elseif kind == "bc" then -- Bidi class
        emit("INSERT INTO BidiClasses (name, description) VALUES ('%s', '%s');",
             addslashes(short), addslashes(long))
      elseif kind == "blk" then -- Blocks
        local blockname = delunderscores(long)
        block_names[loose_key(long)] = blockname
        emit("INSERT INTO Blocks (name) VALUES ('%s');", addslashes(blockname))
      end
    end
  end
end

-- Properties (PropertyAliases): short name and first long name.
local function emit_properties(path)
  for line in io.lines(path) do
    local fields = parse_data_line(line)
    if fields and fields[2] then
      emit("INSERT INTO Properties (shorthand, description) VALUES ('%s', '%s');",
           addslashes(fields[1]), addslashes(fields[2]))
    end
  end
end

-- Emit the INSERT and the follow-up UPDATEs for one ordinary character.
local function emit_character(codepoint, hexcodepoint, name, category,
                              bidiclass, fields)
  emit("INSERT INTO Characters (codepoint, name, category, bidiclass) VALUES (%i, '%s', '%s', '%s'); -- U+%s",
       codepoint, addslashes(name), addslashes(category),
       addslashes(bidiclass), hexcodepoint)

  -- Empty (or unparsable) case mapping fields give nil.
  local uppercase = tonumber(fields[13] or "", 16)
  local lowercase = tonumber(fields[14] or "", 16)
  local titlecase = tonumber(fields[15] or "", 16)
  if uppercase ~= nil and titlecase == nil then
    titlecase = uppercase
  end
  if uppercase ~= nil then
    emit("UPDATE Characters SET uppercase=%i WHERE codepoint=%i;",
         uppercase, codepoint)
  end
  if titlecase ~= nil then
    emit("UPDATE Characters SET titlecase=%i WHERE codepoint=%i;",
         titlecase, codepoint)
  end
  if lowercase ~= nil then
    emit("UPDATE Characters SET lowercase=%i WHERE codepoint=%i;",
         lowercase, codepoint)
  end

  local decomposition_type, decomposition =
    parse_decomposition(fields[6] or "")
  if decomposition_type ~= nil then
    emit("UPDATE Characters SET decomposition_type='%s' WHERE codepoint=%i;",
         addslashes(decomposition_type), codepoint)
  end
  if #decomposition > 0 then
    emit("UPDATE Characters SET decomposition='{%s}' WHERE codepoint=%i;",
         table.concat(decomposition, ", "), codepoint)
  end
end

-- Emit one INSERT per code point of a "<Name, First>".."<Name, Last>"
-- range, except for private use and surrogate ranges which are ignored.
local function emit_character_range(name, first, last, category, bidiclass)
  if (string.sub(name, 1, 11) == "Private Use") or
     (string.sub(name, -11) == "Private Use") or
     (string.sub(name, -9) == "Surrogate") then
    emit("-- Ignoring range %s from %i to %i", name, first, last)
    return
  end
  emit("-- Range %s from %i to %i", name, first, last)
  for i = first, last do
    -- TODO: better synthetized names, for instance for Hangul syllabes, the
    -- (complicated) algorithm is specified in the Unicode standard
    -- See http://www.unicode.org/mail-arch/unicode-ml/y2007-m09/0015.html
    local synthetizedname = string.upper(string.format("%s-%X", name, i))
    emit("INSERT INTO Characters (codepoint, name, category, bidiclass) VALUES (%i, '%s', '%s', '%s');",
         i, addslashes(synthetizedname), addslashes(category),
         addslashes(bidiclass))
  end
end

-- Characters (UnicodeData).
local function emit_characters(path)
  local lineno = 0
  local range_start = nil -- code point of the pending "First>" line, if any
  for line in io.lines(path) do
    lineno = lineno + 1
    local record = string.gsub(line, "\r$", "")
    if record ~= "" then
      local fields = split_fields(record)
      local hexcodepoint = fields[1]
      local codepoint = tonumber(hexcodepoint, 16)
      if not codepoint or not fields[5] then
        error(string.format("Impossible to parse line %i (\"%s\") of UnicodeData",
                            lineno, record))
      end
      local name = fields[2]
      local category = fields[3]
      local bidiclass = fields[5]

      -- Underdocumented convention for ranges of characters
      if string.sub(name, -6) == "First>" then
        if range_start ~= nil then
          error(string.format(
            "First> seen at line %i while already in a character range",
            lineno))
        end
        range_start = codepoint
      elseif string.sub(name, -5) == "Last>" then
        if range_start == nil then
          error(string.format(
            "Last> seen at line %i while not in a character range", lineno))
        end
        -- Drop the ", Last>" suffix and the leading <
        name = string.sub(name, 2, -8)
        emit_character_range(name, range_start, codepoint, category, bidiclass)
        range_start = nil
      elseif range_start ~= nil then
        -- Such a line used to be dropped silently.
        error(string.format(
          "Ordinary character seen at line %i inside a character range",
          lineno))
      else
        emit_character(codepoint, hexcodepoint, name, category, bidiclass,
                       fields)
      end

      if DEBUG then
        for i, field in ipairs(fields) do
          io.stderr:write(string.format("%d\t%s\n", i, field))
        end
      end
    end
  end
end

-- Unified Han Characters properties (Unihan): tab-separated
-- "U+XXXX <property> <value>" lines.
local function emit_han_properties(path)
  local lineno = 0
  local alreadyseen = {}
  for line in io.lines(path) do
    lineno = lineno + 1
    local record = string.gsub(line, "\r$", "")
    if not string.find(record, "^ *#") and record ~= "" then
      local fields = {}
      -- Tabulation-separated fields
      for field in string.gmatch(record, "([^\t]+)") do
        fields[#fields + 1] = field
      end
      if not fields[3] then
        error(string.format("Impossible to parse line %i (\"%s\") of Unihan",
                            lineno, record))
      end
      local hexcodepoint = string.sub(fields[1], 3) -- skip the "U+" prefix
      local codepoint = tonumber(hexcodepoint, 16)
      if not codepoint then
        error(string.format("Cannot convert \"%s\" at line %i to a code point",
                            hexcodepoint, lineno))
      end
      local name = fields[2]
      local value = fields[3]
      if not alreadyseen[hexcodepoint] then
        alreadyseen[hexcodepoint] = true
        emit("INSERT INTO Han_Properties (codepoint) VALUES (%i);", codepoint)
      end
      if name == "kDefinition" then
        emit("UPDATE Han_Properties SET Definition='%s' WHERE codepoint = %i;",
             addslashes(value), codepoint)
      elseif name == "kTotalStrokes" then
        -- The value may hold several space-separated counts; keep the first.
        local strokes = tonumber((string.match(value, "%d+")))
        if not strokes then
          error(string.format("Cannot convert \"%s\" at line %i to a stroke count",
                              value, lineno))
        end
        emit("UPDATE Han_Properties SET TotalStrokes=%i WHERE codepoint = %i;",
             strokes, codepoint)
      end
    end
  end
end

-- Age (Unicode version) of the characters (DerivedAge).
local function emit_ages(path)
  local lineno = 0
  for line in io.lines(path) do
    lineno = lineno + 1
    local fields = parse_data_line(line)
    if fields then
      if fields[1] == "" then
        error(string.format("No codepoint at line %i", lineno))
      end
      if not fields[2] then
        error(string.format("No version at line %i", lineno))
      end
      local version = addslashes(fields[2])
      local first, last, is_range = parse_range(fields[1], lineno)
      if is_range then
        emit("UPDATE Characters SET version='%s' WHERE codepoint>=%i AND codepoint<=%i;",
             version, first, last)
      else
        emit("UPDATE Characters SET version='%s' WHERE codepoint=%i;",
             version, first)
      end
    end
  end
end

-- Properties of the characters (PropList): one row per code point.
local function emit_character_properties(path)
  local lineno = 0
  for line in io.lines(path) do
    lineno = lineno + 1
    local fields = parse_data_line(line)
    if fields then
      if not fields[2] then
        error(string.format("No property at line %i", lineno))
      end
      local property = addslashes(fields[2])
      local first, last = parse_range(fields[1], lineno)
      for cp = first, last do
        emit("INSERT INTO Characters_Properties (property, codepoint) VALUES ('%s', %i);",
             property, cp)
      end
    end
  end
end

-- CaseFolding: "code; status; mapping; # name".
local function emit_casefoldings(path)
  local lineno = 0
  for line in io.lines(path) do
    lineno = lineno + 1
    local fields = parse_data_line(line)
    if fields then
      local hexcodepoint = fields[1]
      local codepoint = tonumber(hexcodepoint, 16)
      if not codepoint then
        error(string.format("Cannot convert \"%s\" at line %i to a code point",
                            hexcodepoint, lineno))
      end
      local status = fields[2]
      local casefolding = parse_casefolding(fields[3] or "")
      -- Today, we keep full case folding
      if #casefolding > 0 and (status == "C" or status == "F") then
        emit("UPDATE Characters SET casefolding='{%s}' WHERE codepoint=%i; -- U+%s",
             table.concat(casefolding, ", "), codepoint, hexcodepoint)
      end
    end
  end
end

-- Assignment to Blocks (Blocks): "XXXX..YYYY; Block Name".
local function emit_block_assignments(path)
  local lineno = 0
  for line in io.lines(path) do
    lineno = lineno + 1
    local fields = parse_data_line(line)
    if fields and fields[2] then
      -- Use the spelling stored in the Blocks table when the name is known
      -- from PropertyValueAliases, so that the sub-select finds the row.
      local blockname = block_names[loose_key(fields[2])] or fields[2]
      local first, last = parse_range(fields[1], lineno)
      emit("UPDATE Characters SET block=(SELECT id FROM Blocks WHERE name='%s') WHERE codepoint>=%i AND codepoint<=%i;",
           addslashes(blockname), first, last)
    end
  end
end

---------------------------------------------------------------------------
-- Main program
---------------------------------------------------------------------------

assert(#arg == 1, string.format("Usage: %s UCD-directory-name", arg[0]))
local directory = arg[1]

print("BEGIN;")

emit_property_values(findfile(directory, "PropertyValueAliases"))
emit_properties(findfile(directory, "PropertyAliases"))
emit_characters(findfile(directory, "UnicodeData"))
emit_han_properties(findfile(directory, "Unihan"))
emit_ages(findfile(directory, "DerivedAge"))
emit_character_properties(findfile(directory, "PropList"))
emit_casefoldings(findfile(directory, "CaseFolding"))
emit_block_assignments(findfile(directory, "Blocks"))

print("COMMIT;")