summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Silverstone <dsilvers@digital-scurf.org>2016-07-30 14:28:53 +0100
committerDaniel Silverstone <dsilvers@digital-scurf.org>2016-07-30 14:28:53 +0100
commite1b5ff2a403e83a52926af9cebce30d4013944d7 (patch)
treecc1a2aa3f62b89d7bb3e05a4cd2d4a813c1a20db
parent607b32c69aab063bd65ac948f486d31f027c617e (diff)
downloadtongue-e1b5ff2a403e83a52926af9cebce30d4013944d7.tar.bz2
Some basic transliteration support, basic tests, not all yet
-rw-r--r--Makefile2
-rw-r--r--lib/tongue.lua6
-rw-r--r--lib/tongue/transliteration.lua132
-rw-r--r--lib/tongue/util.lua42
-rw-r--r--test/test-tongue.lua10
5 files changed, 191 insertions, 1 deletions
diff --git a/Makefile b/Makefile
index 7f562e6..36621b6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
all: test doc
-MODULES := tongue tongue.langpack
+MODULES := tongue tongue.langpack tongue.util tongue.transliteration
LUA_VER := 5.1
PREFIX ?= /usr/local
diff --git a/lib/tongue.lua b/lib/tongue.lua
index a22bff0..7e77107 100644
--- a/lib/tongue.lua
+++ b/lib/tongue.lua
@@ -16,6 +16,8 @@ local _ABI = 1
local VERSION = "Tongue Version " .. tostring(_VERSION)
local langpack = require "tongue.langpack"
+local transliteration = require "tongue.transliteration"
+local util = require "tongue.util"
return {
_VERSION = _VERSION,
@@ -23,4 +25,8 @@ return {
_ABI = _ABI,
ABI = ABI,
langpack = langpack,
+ transliteration = transliteration,
+ translit = transliteration,
+ t13n = transliteration,
+ util = util,
}
diff --git a/lib/tongue/transliteration.lua b/lib/tongue/transliteration.lua
new file mode 100644
index 0000000..8ac2318
--- /dev/null
+++ b/lib/tongue/transliteration.lua
@@ -0,0 +1,132 @@
+-- lib/tongue/transliteration.lua
+--
+-- Lua I18N library 'Tongue' - Transliteration of strings
+--
+-- Copyright 2016 Daniel Silverstone <dsilvers@digital-scurf.org>
+--
+-- For licence terms, see COPYING
+--
+
+--- Tongue language packs are internally always in UTF-8, but users may need
+-- different encodings.
+--
+-- Since users might have all sorts of ways of specifying the desired character
+-- encoding for their messages, Tongue provides a mechanism for deriving the
+-- target character encoding and then transliterating to and from that
+-- encoding.
+--
+-- @module tongue.transliteration
+
+local iconv = require "iconv"
+local util = require "tongue.util"
+
+local converter = {}
+
+--- Tongue character-set converter
+--
+-- Internally Tongue works purely in UTF-8, yet its input and output may be in
+-- whatever character set a user chooses.  A converter object wraps a pair of
+-- iconv descriptors (one per direction) which perform that conversion.
+--
+-- @type converter
+
+--- Convert a string to the user character set.
+--
+-- When the underlying descriptor yields no string, the value returned is the
+-- text "Error " followed by the second result of the iconv call, so a failure
+-- is reported in-band in place of converted output.
+--
+-- @tparam string input The input (UTF-8) string
+-- @treturn string The output (user charset) string
+-- @function touser
+function converter:touser(input)
+   local converted, errcode = self._touser:iconv(input)
+   if not converted then
+      return "Error " .. tostring(errcode)
+   end
+   return converted
+end
+
+--- Convert a string from the user character set.
+--
+-- When the underlying descriptor yields no string, the value returned is the
+-- text "Error " followed by the second result of the iconv call, so a failure
+-- is reported in-band in place of converted output.
+--
+-- @tparam string input The input (user charset) string
+-- @treturn string The output (UTF-8) string
+-- @function fromuser
+function converter:fromuser(input)
+   local converted, errcode = self._fromuser:iconv(input)
+   if not converted then
+      return "Error " .. tostring(errcode)
+   end
+   return converted
+end
+
+-- Metatable shared by every converter: method lookups fall through to the
+-- `converter` table above.
+local converter_mt = { __index = converter }
+
+---
+-- @section tongue.transliteration
+
+--- Retrieve a tongue encoding converter.
+--
+-- Construct and return a converter which can convert between the provided
+-- encoding and UTF-8 in either direction.  The converter will be configured to
+-- transliterate where possible and to replace bad or unknown codepoints so as
+-- to ensure that the outputs are always valid.
+--
+-- If the desired encoding is UTF-8 then the converter returned shall
+-- effectively be a passthrough, excepting that invalid or malformed codepoints
+-- shall be "cleaned up" by the converter object.
+--
+-- If either iconv descriptor cannot be opened (for example because the
+-- encoding name is not known to iconv) no converter is returned; instead the
+-- function returns nil and a message naming the offending encoding.
+--
+-- @tparam string encoding The desired encoding to be used
+-- @treturn[1] converter The bidirectional character converter
+-- @treturn[2] nil Returned when the encoding could not be opened
+-- @treturn[2] string A message describing the failure
+-- @function get
+local function get_converter(encoding)
+   -- iconv.open takes (tocode, fromcode).  A descriptor which could not be
+   -- opened comes back as nil, so both must be checked before they are stored;
+   -- otherwise the failure only surfaces later as an index-nil error inside
+   -- touser/fromuser.
+   local touser = iconv.open(encoding .. "//TRANSLIT//IGNORE", "UTF-8")
+   local fromuser = iconv.open("UTF-8//TRANSLIT//IGNORE", encoding)
+   if not (touser and fromuser) then
+      return nil, "Unsupported character encoding: " .. tostring(encoding)
+   end
+   return setmetatable({ _touser = touser, _fromuser = fromuser }, converter_mt)
+end
+
+--- Retrieve a tongue encoding converter based on the environment.
+--
+-- Firstly this function attempts to determine the encoding desired by the
+-- "client" by means of examining the provided environment table (or the
+-- process environment if none was given).  The variables LC_ALL, LC_MESSAGES
+-- and LANG are consulted in that order; the first one which is set to a
+-- non-empty value decides the encoding, and later variables are not examined.
+-- Once an encoding has been determined, tongue will return a converter by
+-- calling through to the @{get} function.
+--
+-- If the deciding variable names no encoding, or names one for which a usable
+-- converter cannot be constructed, tongue will assume that UTF-8 is
+-- appropriate.
+--
+-- @tparam ?table env The environment to use (or nil to use the process env)
+-- @treturn converter The bidirectional character converter
+-- @function guess
+local function guess_converter(env)
+   -- Look a variable up in the supplied table, or in the process environment
+   -- when no table was supplied.  Unset variables read as the empty string.
+   local function getenv(k)
+      local value
+      if env then
+         value = env[k]
+      else
+         value = os.getenv(k)
+      end
+      return value or ""
+   end
+
+   -- glibc's approach is first to look at LC_ALL, then failing that
+   -- LC_MESSAGES, and failing that, LANG.  (Well, LANG is considered first as
+   -- a fallback approach, but considering we're just hunting for an encoding
+   -- it'll be okay to consider it last)
+   local encoding
+   for _, name in ipairs { "LC_ALL", "LC_MESSAGES", "LANG" } do
+      local category = getenv(name)
+      if category ~= "" then
+         local _, __, enc = util.split_category(category)
+         if enc then
+            -- A locale modifier ("@euro", "@latin", ...) is not part of the
+            -- codeset name and would make iconv reject it.
+            enc = (enc:gsub("@.*$", ""))
+            if enc ~= "" then
+               encoding = enc
+            end
+         end
+         -- The first variable which is set wins, even if it names no codeset.
+         break
+      end
+   end
+
+   if encoding then
+      local conv = get_converter(encoding)
+      -- Only hand back a converter whose descriptors were both opened.
+      if conv and conv._touser and conv._fromuser then
+         return conv
+      end
+   end
+   return get_converter("UTF-8")
+end
+
+-- Public interface of tongue.transliteration: `get` builds a converter for a
+-- named encoding, `guess` derives the encoding from an environment first.
+return {
+ get = get_converter,
+ guess = guess_converter,
+}
+
diff --git a/lib/tongue/util.lua b/lib/tongue/util.lua
new file mode 100644
index 0000000..0c2fc73
--- /dev/null
+++ b/lib/tongue/util.lua
@@ -0,0 +1,42 @@
+-- lib/tongue/util.lua
+--
+-- Lua I18N library 'Tongue' - Utility routines
+--
+-- Copyright 2016 Daniel Silverstone <dsilvers@digital-scurf.org>
+--
+-- For licence terms, see COPYING
+--
+
+--- Tongue needs to process lots of interesting data; here are some utility
+-- functions it might use.
+--
+-- @module tongue.util
+
+--- Split a category into its components.
+--
+-- Categories can have low level languages, sub languages (countries),
+-- character sets and modifiers.  This routine splits a category string laid
+-- out in the glibc manner, `ll[_CC][.charset][@modifier]`, which seems to be a
+-- reasonable 'standard' to use.
+--
+-- Components which are absent, or present but empty (for example the charset
+-- in `"en_GB."`), are returned as nil.
+--
+-- @tparam string category The category to split up
+-- @treturn string The base (low level) language name.
+-- @treturn ?string The sub-language (or nil if no sub-language was specified)
+-- @treturn ?string The characterset (or nil if no character set was specified)
+-- @treturn ?string The modifier (or nil if no modifier was specified)
+-- @function split_category
+local function split_category(category)
+   -- ll_CC.SSS@MMM
+   local language = category
+
+   -- The modifier is everything after the first "@"; take it off first so it
+   -- cannot end up glued to the charset or the country.
+   local head, modifier = language:match("^([^@]*)@(.*)$")
+   if head then
+      language = head
+   end
+
+   -- The patterns accept an empty tail (".*" rather than ".+") so that a
+   -- trailing separator never makes the match fail and lose the language.
+   local charset
+   head, charset = language:match("^(.-)%.(.*)$")
+   if head then
+      language = head
+   end
+
+   local country
+   head, country = language:match("^(.-)_(.*)$")
+   if head then
+      language = head
+   end
+
+   -- Report empty optional components as "not specified".
+   if country == "" then country = nil end
+   if charset == "" then charset = nil end
+   if modifier == "" then modifier = nil end
+
+   return language, country, charset, modifier
+end
+
+-- Public interface of tongue.util.
+return {
+ split_category = split_category,
+}
+
diff --git a/test/test-tongue.lua b/test/test-tongue.lua
index fc77e66..6cfad31 100644
--- a/test/test-tongue.lua
+++ b/test/test-tongue.lua
@@ -47,6 +47,16 @@ function suite.langpack_module_present()
assert(tongue.langpack, "Language pack module not present")
end
+-- The transliteration module is exported from tongue under its full name and
+-- under two shorter aliases.  Each export is checked with its own message so
+-- that a failure identifies which one is missing, and the aliases are checked
+-- to refer to the same module as the full name.
+function suite.transliteration_module_present()
+   assert(tongue.transliteration, "Transliteration module not present")
+   assert(tongue.translit, "Transliteration alias 'translit' not present")
+   assert(tongue.t13n, "Transliteration alias 't13n' not present")
+   assert(tongue.translit == tongue.transliteration,
+          "Alias 'translit' is not the transliteration module")
+   assert(tongue.t13n == tongue.transliteration,
+          "Alias 't13n' is not the transliteration module")
+end
+
+-- The utility module must be reachable through the top-level tongue table.
+function suite.util_module_present()
+   local util_module = tongue.util
+   assert(util_module, "Util module not present")
+end
+
local count_ok = 0
for _, testname in ipairs(testnames) do
-- print("Run: " .. testname)