Encoding::Converter
class Encoding::Converter
Parent:Data
常量
AFTER_OUTPUT
AFTER_OUTPUT
在某些输出完成但在所有输入消耗完之前停止转换。有关示例,请参阅#primitive_convert。
CRLF_NEWLINE_DECORATOR
CRLF_NEWLINE_DECORATOR
用于将LF转换为CRLF的装饰器
CR_NEWLINE_DECORATOR
CR_NEWLINE_DECORATOR
用于将LF转换为CR的装饰器
INVALID_MASK
INVALID_MASK
用于无效字节序列的掩码
INVALID_REPLACE
INVALID_REPLACE
替换无效的字节序列
PARTIAL_INPUT
PARTIAL_INPUT
指示:源可能是较大字符串的一部分。有关示例,请参阅#primitive_convert。
UNDEF_HEX_CHARREF
UNDEF_HEX_CHARREF
将目标编码中未定义的字节序列替换为XML十六进制字符引用。这对XML转换有效。
UNDEF_MASK
UNDEF_MASK
用于"在源编码中有效、但在目标编码中没有对应字符"的字符的掩码。
UNDEF_REPLACE
UNDEF_REPLACE
替换目标编码中未定义的字节序列。
UNIVERSAL_NEWLINE_DECORATOR
UNIVERSAL_NEWLINE_DECORATOR
用于将CRLF和CR转换为LF的装饰器
XML_ATTR_CONTENT_DECORATOR
XML_ATTR_CONTENT_DECORATOR
转义为XML AttValue
XML_ATTR_QUOTE_DECORATOR
XML_ATTR_QUOTE_DECORATOR
转义为XML AttValue
XML_TEXT_DECORATOR
XML_TEXT_DECORATOR
转义为XML CharData
公共类方法
Encoding::Converter.asciicompat_encoding(string) → encoding or nil Show source
Encoding::Converter.asciicompat_encoding(encoding) → encoding or nil
返回相应的ASCII兼容编码。
如果参数本身就是ASCII兼容编码,则返回 nil。
“对应的ASCII兼容编码”是ASCII兼容编码,其可以表示与给定的ASCII不兼容编码完全相同的字符。所以,在两种编码之间转换时不会发生未定义的转换错误。
Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
/*
 * Encoding::Converter.asciicompat_encoding(string_or_encoding)
 *
 * Resolves +arg+ to an encoding name through enc_arg(), then asks
 * rb_econv_asciicompat_encoding() for the name of the corresponding
 * ASCII-compatible encoding.
 *
 * Returns the Encoding object for that name, or nil when
 * rb_econv_asciicompat_encoding() reports no counterpart (NULL).
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *arg_name, *result_name;
    rb_encoding *arg_enc, *result_enc;

    enc_arg(&arg, &arg_name, &arg_enc);

    result_name = rb_econv_asciicompat_encoding(arg_name);
    if (result_name == NULL)
        return Qnil;

    result_enc = make_encoding(result_name);
    return rb_enc_from_encoding(result_enc);
}
Encoding::Converter.new(source_encoding, destination_encoding) Show source
Encoding::Converter.new(source_encoding, destination_encoding, opt)
Encoding::Converter.new(convpath)
可能的选项元素:
hash form:
:invalid => nil # raise error on invalid byte sequence (default)
:invalid => :replace # replace invalid byte sequence
:undef => nil # raise error on undefined conversion (default)
:undef => :replace # replace undefined conversion
:replace => string # replacement string ("?" or "\uFFFD" if not specified)
:newline => :universal # decorator for converting CRLF and CR to LF
:newline => :crlf # decorator for converting LF to CRLF
:newline => :cr # decorator for converting LF to CR
:universal_newline => true # decorator for converting CRLF and CR to LF
:crlf_newline => true # decorator for converting LF to CRLF
:cr_newline => true # decorator for converting LF to CR
:xml => :text # escape as XML CharData.
:xml => :attr # escape as XML AttValue
integer form:
Encoding::Converter::INVALID_REPLACE
Encoding::Converter::UNDEF_REPLACE
Encoding::Converter::UNDEF_HEX_CHARREF
Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
Encoding::Converter::CRLF_NEWLINE_DECORATOR
Encoding::Converter::CR_NEWLINE_DECORATOR
Encoding::Converter::XML_TEXT_DECORATOR
Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
::new creates an instance of Encoding::Converter.
source_encoding 和 destination_encoding 应该是字符串或 Encoding 对象。
opt 应该是 nil、散列或整数。
convpath应该是一个数组。它可能包含:
- 包含编码或编码名称的两元素数组,或是:
- 表示装饰器名称的字符串。
::new 可选地带有一个选项。该选项应该是散列或整数。选项散列可以包含 :invalid => nil 等。选项整数应该是 Encoding::Converter::INVALID_REPLACE 等常量的按位或(logical-or)。
:invalid => nil
在无效字节序列上引发错误,这会是一个默认行为。
:invalid => :replace
用替换字符串替换无效字节序列。
:undef => nil
如果#source_encoding中的字符未在destination_encoding中定义,则引发错误。这是一个默认行为。
:undef => :replace
用替换字符串替换#destination_encoding中的未定义字符。
:replace => string
指定替换字符串。如果未指定,则对Unicode编码使用"\uFFFD",对其他编码使用"?"。
:universal_newline => true
将CRLF和CR转换为LF。
:crlf_newline => true
将LF转换为CRLF。
:cr_newline => true
将LF转换为CR。
:xml => :text
作为XML CharData转义。此表单可以用作HTML 4.0 #PCDATA。
- '&' -> '&amp;'
- '<' -> '&lt;'
- '>' -> '&gt;'
- #destination_encoding 中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
:xml => :attr
转义为XML AttValue。转换后的结果被引用为“...”。此表单可以用作HTML 4.0属性值。
- '&' -> '&amp;'
- '<' -> '&lt;'
- '>' -> '&gt;'
- '"' -> '&quot;'
- #destination_encoding 中未定义的字符 -> 十六进制 CharRef,例如 &#xHH;
- 在源缓冲区中发现无效的字节序列(:invalid_byte_sequence)。primitive_errinfo 和 last_error 方法返回错误的详细信息。
- 源缓冲区意外结束(:incomplete_input)。仅当未指定 :partial_input 时才会发生。primitive_errinfo 和 last_error 方法返回错误的详细信息。
- 字符无法用输出编码表示(:undefined_conversion)。primitive_errinfo 和 last_error 方法返回错误的详细信息。
- 在生成了一些输出之后,在输入完成之前(:after_output),只有在指定after_output时才会发生这种情况。
- 目标缓冲区已满(:destination_buffer_full)只有在destination_bytesize非零时才会发生。
- 源缓冲区为空(:source_buffer_empty),仅当指定partial_input时才会发生。
- 转换完成(:finished)
例:
ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 100)
p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:finished, "", "i"]
/*
 * ec.primitive_convert(input, output
 *                      [, output_byteoffset [, output_bytesize [, opt]]])
 *
 * Converts bytes from +input+ into +output+ via rb_econv_convert() and
 * returns the conversion result as a symbol (econv_result_to_symbol()).
 *
 * input             - String or nil; consumed bytes are dropped from it.
 * output            - String; converted bytes are written into it.
 * output_byteoffset - Integer or nil; nil means "append at the current end
 *                     of output".
 * output_bytesize   - Integer or nil; nil means the buffer is grown
 *                     automatically (doubling on each retry).
 * opt               - either an Integer of flags, or a keyword hash whose
 *                     :partial_input / :after_output entries are mapped to
 *                     ECONV_PARTIAL_INPUT / ECONV_AFTER_OUTPUT.
 *
 * Raises ArgumentError for negative or too large offsets/sizes and when the
 * automatically grown result would exceed LONG_MAX.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    /* nil offset/size get placeholder values here; the real values are
     * computed below once the strings have been validated. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0; /* dummy */
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    if (NIL_P(output_bytesize_v))
        output_bytesize = 0; /* dummy */
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* An integer flags argument and a keyword hash are mutually
         * exclusive; report it as an arity error. */
        if (!NIL_P(opt)) {
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    /* No explicit size: start with max(RSTRING_EMBED_LEN_MAX, input length)
     * and let the retry loop below double it as needed. */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = RSTRING_EMBED_LEN_MAX;
        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* nil offset means "append": recomputed on every pass because the
     * previous pass may have lengthened output. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* The sum is computed in unsigned long so that wrap-around is well
     * defined and can be detected by the comparison below. */
    output_byteend = (unsigned long)output_byteoffset +
                     (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    /* Pointers are taken only after the resize above, which may have
     * moved the output buffer. */
    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    /* op now points just past the last byte written. */
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input)) {
        OBJ_INFECT_RAW(output, input);
        /* Remove the bytes that were consumed from the caller's input. */
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
    }

    /* Auto-sized buffer ran out of room: double the size and continue,
     * appending after what has been written so far. */
    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}
primitive_errinfo → array Show source
#primitive_errinfo将有关上次错误的重要信息作为5元素数组返回:
[result, enc1, enc2, error_bytes, readagain_bytes]
结果是primitive_convert的最后结果。
其他元素仅在结果为 :invalid_byte_sequence、:incomplete_input 或 :undefined_conversion 时才有意义。
enc1和enc2将转换步骤表示为一对字符串。例如,从EUC-JP到ISO-8859-1的转换器按如下方式转换字符串:EUC-JP - > UTF-8 - > ISO-8859-1。因此enc1,enc2是“EUC-JP”,“UTF-8”或“UTF-8”,“ISO-8859-1”。
error_bytes和readagain_bytes指示导致错误的字节序列。error_bytes是丢弃的部分。readagain_bytes是缓冲部分,在下次转换时会再次读取。
Example:
# \xff is invalid as EUC-JP.
ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
ec.primitive_convert(src="\xff", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
# HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
# Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
# error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
# partial character is invalid
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
# Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
# partial characters.
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
p ec.primitive_errinfo
#=> [:source_buffer_empty, nil, nil, nil, nil]
# \xd8\x00\x00@ is invalid as UTF-16BE because
# no low surrogate after high surrogate (\xd8\x00).
# It is detected by 3rd byte (\00) which is part of next character.
# So the high surrogate (\xd8\x00) is discarded and
# the 3rd byte is read again later.
# Since the byte is buffered in ec, it is dropped from src.
ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
p src
#=> "@"
# Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
# The problem is detected by 4th byte.
ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
p src
#=> ""
/*
 * ec.primitive_errinfo -> array
 *
 * Builds a 5-element array from ec->last_error:
 *   [result, source_encoding, destination_encoding,
 *    error_bytes, readagain_bytes]
 *
 * Slots 1-4 are filled only when the corresponding last_error field is
 * set; otherwise they stay nil.
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    VALUE ary;

    ary = rb_ary_new2(5);

    rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
    /* Store into the last slot up front so the array always has five
     * elements, even when none of the optional fields below is set. */
    rb_ary_store(ary, 4, Qnil);

    if (ec->last_error.source_encoding)
        rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));

    if (ec->last_error.destination_encoding)
        rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));

    if (ec->last_error.error_bytes_start) {
        /* The read-again bytes are stored directly after the error bytes
         * in the same buffer. */
        rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
        rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
    }

    return ary;
}
putback(p1 = v1) Show source
call-seq
ec.putback -> string
ec.putback(max_numbytes) -> string
放回将被转换的字节。
这些字节是由invalid_byte_sequence错误引起的。当invalid_byte_sequence错误时,一些字节被丢弃,一些字节被缓冲以后转换。后面的字节可以放回去。可以通过Encoding :: InvalidByteSequenceError#readagain_bytes和#primitive_errinfo来观察。
ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
src = "\x00\xd8\x61\x00"
dst = ""
p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
p ec.putback #=> "a\x00"
p ec.putback #=> "" # no more bytes to put back
/*
 * ec.putback                -> string
 * ec.putback(max_numbytes)  -> string
 *
 * Returns a new string holding bytes put back by rb_econv_putback().
 * Without an argument, the byte count is whatever rb_econv_putbackable()
 * reports; with +max_numbytes+, the count is capped at that value.
 * The result is tagged with ec->source_encoding when one is set.
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    int n;
    int putbackable;
    VALUE str, max;

    rb_scan_args(argc, argv, "01", &max);

    if (NIL_P(max))
        n = rb_econv_putbackable(ec);
    else {
        n = NUM2INT(max);
        putbackable = rb_econv_putbackable(ec);
        /* Never request more bytes than the converter can put back. */
        if (putbackable < n)
            n = putbackable;
    }

    /* Allocate an n-byte string and let rb_econv_putback() fill it. */
    str = rb_str_new(NULL, n);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);

    if (ec->source_encoding) {
        rb_enc_associate(str, ec->source_encoding);
    }

    return str;
}
replacement → string Show source
返回替换字符串。
ec = Encoding::Converter.new("euc-jp", "us-ascii")
p ec.replacement #=> "?"
ec = Encoding::Converter.new("euc-jp", "utf-8")
p ec.replacement #=> "\uFFFD"
/*
 * ec.replacement -> string
 *
 * Ensures the replacement is set up via make_replacement(), then returns
 * a new string built from ec->replacement_str / ec->replacement_len,
 * tagged with the encoding named by ec->replacement_enc.
 *
 * Raises Encoding::UndefinedConversionError when make_replacement()
 * returns -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    int ret;
    rb_encoding *enc;

    ret = make_replacement(ec);
    if (ret == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    enc = rb_enc_find(ec->replacement_enc);
    return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
}
replacement = string显示源文件
设置替换字符串。
ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
ec.replacement = "<undef>"
p ec.convert("a \u3042 b") #=> "a <undef> b"
/*
 * ec.replacement = string
 *
 * Passes the bytes of +arg+ (coerced with StringValue) together with the
 * name of its encoding to rb_econv_set_replacement().
 *
 * Returns +arg+ unchanged. Raises Encoding::UndefinedConversionError when
 * rb_econv_set_replacement() returns -1.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE string = arg;
    int ret;
    rb_encoding *enc;

    StringValue(string);
    enc = rb_enc_get(string);

    ret = rb_econv_set_replacement(ec,
            (const unsigned char *)RSTRING_PTR(string),
            RSTRING_LEN(string),
            rb_enc_name(enc));

    if (ret == -1) {
        /* xxx: rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}
source_encoding→encoding 显示源文件
以编码对象的形式返回源编码。
/*
 * ec.source_encoding -> encoding
 *
 * Returns ec->source_encoding wrapped as an Encoding object, or nil when
 * the converter has no source encoding set.
 */
static VALUE
econv_source_encoding(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    if (!ec->source_encoding)
        return Qnil;

    return rb_enc_from_encoding(ec->source_encoding);
}