summaryrefslogtreecommitdiff
path: root/lib/tongue/transliteration.lua
blob: a2266d24eb11d229d557e6d01d88006f0b64b3e4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
-- lib/tongue/transliteration.lua
--
-- Lua I18N library 'Tongue' - Transliteration of strings
--
-- Copyright 2016 Daniel Silverstone <dsilvers@digital-scurf.org>
--
-- For licence terms, see COPYING
--

--- Tongue language packs are internally always in UTF-8, but users may need
-- different encodings.
--
-- Since users might have all sorts of ways of specifying the desired character
-- encoding for their messages, Tongue provides a mechanism for deriving the
-- target character encoding and then transliterating to and from that
-- encoding.
--
-- @module tongue.transliteration

local iconv = require "iconv"
local util = require "tongue.util"

local converter = {}

--- Tongue character-set converter
--
-- Tongue works in UTF-8 internally, whereas a user may supply input and
-- expect output in whatever character set they have chosen.  A converter
-- object wraps a pair of iconv descriptors, one per direction, which perform
-- that conversion.
--
--
-- @type converter

--- Convert a string to the user character set.
--
-- The string is passed through the UTF-8 to user-charset iconv descriptor.
-- Should that descriptor yield no string at all, the text "Error " followed
-- by the reported error value is returned in its place.
--
-- @tparam string input The input (UTF-8) string
-- @treturn string The output (user charset) string
-- @function touser
function converter:touser(input)
   local converted, err = self._touser:iconv(input)
   if converted then
      return converted
   end
   return "Error " .. tostring(err)
end

--- Convert a string from the user character set.
--
-- The string is passed through the user-charset to UTF-8 iconv descriptor.
-- Should that descriptor yield no string at all, the text "Error " followed
-- by the reported error value is returned in its place.
--
-- @tparam string input The input (user charset) string
-- @treturn string The output (UTF-8) string
-- @function fromuser
function converter:fromuser(input)
   local converted, err = self._fromuser:iconv(input)
   if converted then
      return converted
   end
   return "Error " .. tostring(err)
end

-- Metatable shared by every converter instance; method lookups fall through
-- to the prototype table above.
local converter_mt = { __index = converter }

---
-- @section tongue.transliteration

--- Retrieve a tongue encoding converter.
--
-- Construct and return a converter which can convert between the provided
-- encoding and UTF-8 in either direction.  Both iconv descriptors are opened
-- with the `//TRANSLIT//IGNORE` suffixes on their target charset, asking iconv
-- to transliterate where possible and to drop whatever it cannot represent.
--
-- If the desired encoding is UTF-8 then the converter returned shall
-- effectively be a passthrough, excepting that invalid or malformed codepoints
-- shall be "cleaned up" by the converter object.
--
-- `iconv.open` yields nil when it cannot build a descriptor (for example when
-- the encoding name is not known to the iconv implementation).  Rather than
-- hand back a converter which would fail on first use, that case is reported
-- to the caller as `nil` plus a message.
--
-- @tparam string encoding The desired encoding to be used
-- @treturn[1] converter The bidirectional character converter
-- @treturn[2] nil If a conversion descriptor could not be opened
-- @treturn[2] string A message naming the encoding which could not be used
-- @function get
local function get_converter(encoding)
   local touser = iconv.open(encoding .. "//TRANSLIT//IGNORE", "UTF-8")
   local fromuser = iconv.open("UTF-8//TRANSLIT//IGNORE", encoding)
   if not touser or not fromuser then
      -- Without both directions the converter methods could only raise an
      -- "attempt to index a nil value" error later on, so fail here instead.
      return nil, ("unable to open iconv descriptors for encoding %q"):format(
         tostring(encoding))
   end
   return setmetatable({ _touser = touser, _fromuser = fromuser }, converter_mt)
end

--- Retrieve a tongue encoding converter based on the environment.
--
-- The encoding wanted by the "client" is derived from the provided
-- environment table (or from the process environment if no table was given).
-- The variables `LC_ALL`, `LC_MESSAGES` and `LANG` are consulted in that order
-- and the first one holding a non-empty value decides the result: its
-- encoding part is handed to the @{get} function, with UTF-8 substituted when
-- the value carries no encoding.
--
-- If none of those variables is set, tongue will assume that UTF-8 is
-- appropriate.
--
-- @tparam ?table env The environment to use (or nil to use the process env)
-- @treturn converter The bidirectional character converter
-- @treturn ?string The value of the variable which decided the encoding; not
--   returned when no variable was set
-- @function guess
local function guess_converter(env)
   -- Fetch a variable from the supplied table, or from the process
   -- environment when no table was given.  Unset variables read as "".
   local function lookup(name)
      local value
      if env then
         value = env[name]
      else
         value = os.getenv(name)
      end
      return value or ""
   end

   -- glibc's approach is first to look at LC_ALL, then failing that
   -- LC_MESSAGES, and failing that, LANG. (Well, LANG is considered first as
   -- a fallback approach, but considering we're just hunting for an encoding
   -- it'll be okay to consider it last)
   for _, name in ipairs { "LC_ALL", "LC_MESSAGES", "LANG" } do
      local category = lookup(name)
      if category ~= "" then
         -- Only the third value produced by split_category (the encoding)
         -- is of interest here.
         local enc = (select(3, util.split_category(category)))
         if not enc or enc == "" then
            enc = "UTF-8"
         end
         return get_converter(enc), category
      end
   end

   return get_converter("UTF-8")
end

-- Public interface of the module: the local constructors are exported under
-- their short names.
return { get = get_converter, guess = guess_converter }