在线文档教程
Ruby 2.4

Encoding::Converter

class Encoding::Converter

Parent:Data

常量

AFTER_OUTPUT

AFTER_OUTPUT

在某些输出完成但在所有输入消耗完之前停止转换。有关示例,请参阅#primitive_convert。

CRLF_NEWLINE_DECORATOR

CRLF_NEWLINE_DECORATOR

用于将LF转换为CRLF的装饰器

CR_NEWLINE_DECORATOR

CR_NEWLINE_DECORATOR

用于将LF转换为CR的装饰器

INVALID_MASK

INVALID_MASK

用于无效字节序列的掩码

INVALID_REPLACE

INVALID_REPLACE

替换无效的字节序列

PARTIAL_INPUT

PARTIAL_INPUT

指示:源可能是较大字符串的一部分。有关示例,请参阅#primitive_convert。

UNDEF_HEX_CHARREF

UNDEF_HEX_CHARREF

将目标编码中未定义的字节序列替换为XML十六进制字符引用。这对XML转换有效。

UNDEF_MASK

UNDEF_MASK

用于以下情况的掩码:字符在源编码中有效,但在目标编码中没有对应的字符。

UNDEF_REPLACE

UNDEF_REPLACE

替换目标编码中未定义的字节序列。

UNIVERSAL_NEWLINE_DECORATOR

UNIVERSAL_NEWLINE_DECORATOR

用于将CRLF和CR转换为LF的装饰器

XML_ATTR_CONTENT_DECORATOR

XML_ATTR_CONTENT_DECORATOR

转义为XML AttValue

XML_ATTR_QUOTE_DECORATOR

XML_ATTR_QUOTE_DECORATOR

转义为XML AttValue

XML_TEXT_DECORATOR

XML_TEXT_DECORATOR

转义为XML CharData

公共类方法

Encoding::Converter.asciicompat_encoding(string) → encoding or nil Show source

Encoding::Converter.asciicompat_encoding(encoding) → encoding or nil

返回相应的ASCII兼容编码。

如果参数是ASCII兼容编码,则返回nil。

“对应的ASCII兼容编码”是ASCII兼容编码,其可以表示与给定的ASCII不兼容编码完全相同的字符。所以,在两种编码之间转换时不会发生未定义的转换错误。

Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil

/*
 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
 *
 * Resolves `arg` to an encoding name with enc_arg(), asks
 * rb_econv_asciicompat_encoding() for the corresponding ASCII-compatible
 * encoding name, and returns that encoding as a Ruby object.
 * Returns nil when no such name is reported (NULL result).
 */
static VALUE
econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
{
    const char *arg_name, *result_name;
    rb_encoding *arg_enc, *result_enc;

    enc_arg(&arg, &arg_name, &arg_enc);

    result_name = rb_econv_asciicompat_encoding(arg_name);

    if (result_name == NULL)
        return Qnil;

    result_enc = make_encoding(result_name);

    return rb_enc_from_encoding(result_enc);
}

Encoding::Converter.new(source_encoding, destination_encoding) Show source

Encoding::Converter.new(source_encoding, destination_encoding, opt)

Encoding::Converter.new(convpath)

可能的选项元素:

hash form:
  :invalid => nil            # raise error on invalid byte sequence (default)
  :invalid => :replace       # replace invalid byte sequence
  :undef => nil              # raise error on undefined conversion (default)
  :undef => :replace         # replace undefined conversion
  :replace => string         # replacement string ("?" or "\uFFFD" if not specified)
  :newline => :universal     # decorator for converting CRLF and CR to LF
  :newline => :crlf          # decorator for converting LF to CRLF
  :newline => :cr            # decorator for converting LF to CR
  :universal_newline => true # decorator for converting CRLF and CR to LF
  :crlf_newline => true      # decorator for converting LF to CRLF
  :cr_newline => true        # decorator for converting LF to CR
  :xml => :text              # escape as XML CharData.
  :xml => :attr              # escape as XML AttValue
integer form:
  Encoding::Converter::INVALID_REPLACE
  Encoding::Converter::UNDEF_REPLACE
  Encoding::Converter::UNDEF_HEX_CHARREF
  Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
  Encoding::Converter::CRLF_NEWLINE_DECORATOR
  Encoding::Converter::CR_NEWLINE_DECORATOR
  Encoding::Converter::XML_TEXT_DECORATOR
  Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
  Encoding::Converter::XML_ATTR_QUOTE_DECORATOR

::new创建一个Encoding::Converter的实例。

source_encoding和destination_encoding应该是字符串或Encoding对象。

opt应该是nil,散列或整数。

convpath应该是一个数组。它可能包含:

  • 包含编码或编码名称的两元素数组,或是:

  • 表示装饰器名称的字符串。

::new可选地带有一个选项。该选项应该是散列或整数。散列形式的选项可以包含:invalid => nil等。整数形式的选项应该是Encoding::Converter::INVALID_REPLACE等常量的按位或(逻辑或)。

:invalid => nil

在无效字节序列上引发错误,这会是一个默认行为。

:invalid => :replace

用替换字符串替换无效字节序列。

:undef => nil

如果source_encoding中的字符未在destination_encoding中定义,则引发错误。这是默认行为。

:undef => :replace

用替换字符串替换destination_encoding中的未定义字符。

:replace => string

指定替换字符串。如果未指定,则对Unicode编码使用"\uFFFD",对其他编码使用"?"。

:universal_newline => true

将CRLF和CR转换为LF。

:crlf_newline => true

将LF转换为CRLF。

:cr_newline => true

将LF转换为CR。

:xml => :text

转义为XML CharData。此形式可以用作HTML 4.0 #PCDATA。

  • '&' -> '&amp;'

  • '<' -> '&lt;'

  • '>' -> '&gt;'

  • destination_encoding中的未定义字符 -> 十六进制CharRef,例如&#xHH;

:xml => :attr

转义为XML AttValue。转换后的结果会用双引号括起来,即"..."。此形式可以用作HTML 4.0属性值。

  • '&' -> '&amp;'

  • '<' -> '&lt;'

  • '>' -> '&gt;'

  • '"' -> '&quot;'

  • destination_encoding中的未定义字符 -> 十六进制CharRef,例如&#xHH;

primitive_convert在满足以下条件之一时停止转换:

  • 在源缓冲区中发现无效的字节序列(:invalid_byte_sequence)。primitive_errinfo和last_error方法返回该错误的详细信息。

  • 源缓冲区意外结束(:incomplete_input)。只有在未指定:partial_input时才会发生。primitive_errinfo和last_error方法返回该错误的详细信息。

  • 字符无法用输出编码表示(:undefined_conversion)。primitive_errinfo和last_error方法返回该错误的详细信息。

  • 在生成了一些输出之后,在输入完成之前(:after_output),只有在指定after_output时才会发生这种情况。

  • 目标缓冲区已满(:destination_buffer_full)只有在destination_bytesize非零时才会发生。

  • 源缓冲区为空(:source_buffer_empty),仅当指定partial_input时才会发生。

  • 转换完成(:finished)

例:

ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 100)
p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]

ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
ret = ec.primitive_convert(src="pi", dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
ret = ec.primitive_convert(src, dst="", nil, 1)
p [ret, src, dst] #=> [:finished, "", "i"]

/*
 * ec.primitive_convert(input, output[, output_byteoffset[, output_bytesize[, flags | opt]]])
 *
 * Runs one rb_econv_convert() step from `input` into `output` and returns
 * the result as a symbol (via econv_result_to_symbol).
 *
 * - `input` may be nil, in which case no source bytes are supplied.
 * - Bytes consumed from `input` are dropped from it; converted bytes are
 *   written into `output` starting at output_byteoffset (defaults to the
 *   current length of `output`).
 * - Flags are taken either from a positional integer or from an option
 *   hash (looked up with sym_partial_input / sym_after_output); supplying
 *   both is an arity error.
 * - When output_bytesize is nil the buffer size is chosen here and doubled
 *   on every econv_destination_buffer_full result until conversion stops
 *   for another reason.
 *
 * Raises ArgumentError for negative or overflowing offsets/sizes.
 */
static VALUE
econv_primitive_convert(int argc, VALUE *argv, VALUE self)
{
    VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
    rb_econv_t *ec = check_econv(self);
    rb_econv_result_t res;
    const unsigned char *ip, *is;
    unsigned char *op, *os;
    long output_byteoffset, output_bytesize;
    unsigned long output_byteend;
    int flags;

    argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);

    if (NIL_P(output_byteoffset_v))
        output_byteoffset = 0; /* dummy */
    else
        output_byteoffset = NUM2LONG(output_byteoffset_v);

    if (NIL_P(output_bytesize_v))
        output_bytesize = 0; /* dummy */
    else
        output_bytesize = NUM2LONG(output_bytesize_v);

    if (!NIL_P(flags_v)) {
        /* A positional flags integer and an option hash are mutually exclusive. */
        if (!NIL_P(opt)) {
            rb_error_arity(argc + 1, 2, 5);
        }
        flags = NUM2INT(rb_to_int(flags_v));
    }
    else if (!NIL_P(opt)) {
        VALUE v;
        flags = 0;
        v = rb_hash_aref(opt, sym_partial_input);
        if (RTEST(v))
            flags |= ECONV_PARTIAL_INPUT;
        v = rb_hash_aref(opt, sym_after_output);
        if (RTEST(v))
            flags |= ECONV_AFTER_OUTPUT;
    }
    else {
        flags = 0;
    }

    StringValue(output);
    if (!NIL_P(input))
        StringValue(input);
    rb_str_modify(output);

    /* No explicit size: start from max(RSTRING_EMBED_LEN_MAX, input length). */
    if (NIL_P(output_bytesize_v)) {
        output_bytesize = RSTRING_EMBED_LEN_MAX;
        if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
            output_bytesize = RSTRING_LEN(input);
    }

  retry:

    /* With no explicit offset, append after whatever `output` holds now. */
    if (NIL_P(output_byteoffset_v))
        output_byteoffset = RSTRING_LEN(output);

    if (output_byteoffset < 0)
        rb_raise(rb_eArgError, "negative output_byteoffset");

    if (RSTRING_LEN(output) < output_byteoffset)
        rb_raise(rb_eArgError, "output_byteoffset too big");

    if (output_bytesize < 0)
        rb_raise(rb_eArgError, "negative output_bytesize");

    /* Compute the end position in unsigned arithmetic so wraparound is detectable. */
    output_byteend = (unsigned long)output_byteoffset +
                     (unsigned long)output_bytesize;

    if (output_byteend < (unsigned long)output_byteoffset ||
        LONG_MAX < output_byteend)
        rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");

    if (rb_str_capacity(output) < output_byteend)
        rb_str_resize(output, output_byteend);

    if (NIL_P(input)) {
        ip = is = NULL;
    }
    else {
        ip = (const unsigned char *)RSTRING_PTR(input);
        is = ip + RSTRING_LEN(input);
    }

    op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
    os = op + output_bytesize;

    res = rb_econv_convert(ec, &ip, is, &op, os, flags);
    /* `op` now points just past the last byte written. */
    rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
    if (!NIL_P(input)) {
        OBJ_INFECT_RAW(output, input);
        /* `ip` now points at the first unconsumed source byte. */
        rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
    }

    /* Caller gave no size limit: double the buffer and continue. */
    if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
        if (LONG_MAX / 2 < output_bytesize)
            rb_raise(rb_eArgError, "too long conversion result");
        output_bytesize *= 2;
        output_byteoffset_v = Qnil;
        goto retry;
    }

    if (ec->destination_encoding) {
        rb_enc_associate(output, ec->destination_encoding);
    }

    return econv_result_to_symbol(res);
}

primitive_errinfo → array Show source

#primitive_errinfo将有关上次错误的重要信息作为5元素数组返回:

[result, enc1, enc2, error_bytes, readagain_bytes]

result是primitive_convert的最后一次结果。

其他元素仅在result为:invalid_byte_sequence,:incomplete_input或:undefined_conversion时才有意义。

enc1和enc2将转换步骤表示为一对字符串。例如,从EUC-JP到ISO-8859-1的转换器按如下方式转换字符串:EUC-JP -> UTF-8 -> ISO-8859-1。因此[enc1, enc2]是["EUC-JP", "UTF-8"]或["UTF-8", "ISO-8859-1"]。

error_bytes和readagain_bytes指示导致错误的字节序列。error_bytes是丢弃的部分。readagain_bytes是缓冲部分,在下次转换时会再次读取。

Example:

# \xff is invalid as EUC-JP.
ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
ec.primitive_convert(src="\xff", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]

# HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
# Since this error is occur in UTF-8 to ISO-8859-1 conversion,
# error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]

# partial character is invalid
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]

# Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
# partial characters.
ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
p ec.primitive_errinfo
#=> [:source_buffer_empty, nil, nil, nil, nil]

# \xd8\x00\x00@ is invalid as UTF-16BE because
# no low surrogate after high surrogate (\xd8\x00).
# It is detected by 3rd byte (\00) which is part of next character.
# So the high surrogate (\xd8\x00) is discarded and
# the 3rd byte is read again later.
# Since the byte is buffered in ec, it is dropped from src.
ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
p src
#=> "@"

# Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
# The problem is detected by 4th byte.
ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
p ec.primitive_errinfo
#=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
p src
#=> ""

/*
 * ec.primitive_errinfo -> array
 *
 * Builds the 5-element array
 *   [result, enc1, enc2, error_bytes, readagain_bytes]
 * from ec->last_error.
 *
 * Slot 0 is always the result symbol. Slots 1-4 are filled only when the
 * corresponding last_error field is set.
 */
static VALUE
econv_primitive_errinfo(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    VALUE ary;

    ary = rb_ary_new2(5);

    rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
    /* Store the last slot up front; presumably this fixes the array at
     * five elements even when nothing else is filled in (compare the
     * documented [:source_buffer_empty, nil, nil, nil, nil] result). */
    rb_ary_store(ary, 4, Qnil);

    if (ec->last_error.source_encoding)
        rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));

    if (ec->last_error.destination_encoding)
        rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));

    if (ec->last_error.error_bytes_start) {
        /* error_bytes: the first error_bytes_len bytes of the buffer. */
        rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
        /* readagain_bytes: the readagain_len bytes that follow them. */
        rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
    }

    return ary;
}

putback(p1 = v1) Show source

call-seq

ec.putback -> string
ec.putback(max_numbytes) -> string

放回将被转换的字节。

这些字节是由invalid_byte_sequence错误引起的。发生invalid_byte_sequence错误时,一些字节被丢弃,另一些字节被缓冲以便稍后转换。后者(被缓冲的字节)可以被放回。可以通过Encoding::InvalidByteSequenceError#readagain_bytes和#primitive_errinfo来观察。

ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
src = "\x00\xd8\x61\x00"
dst = ""
p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
p ec.putback #=> "a\x00"
p ec.putback #=> "" # no more bytes to put back

/*
 * ec.putback                -> string
 * ec.putback(max_numbytes)  -> string
 *
 * Takes bytes back out of the converter with rb_econv_putback() and
 * returns them as a new string. Without an argument, takes everything
 * rb_econv_putbackable() reports; with one, at most that many bytes.
 * The string is tagged with the source encoding when one is set.
 */
static VALUE
econv_putback(int argc, VALUE *argv, VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    int n;
    int putbackable;
    VALUE str, max;

    rb_scan_args(argc, argv, "01", &max);

    if (NIL_P(max)) {
        n = rb_econv_putbackable(ec);
    }
    else {
        n = NUM2INT(max);
        putbackable = rb_econv_putbackable(ec);
        /* Never request more than the converter can give back. */
        if (putbackable < n) {
            n = putbackable;
        }
    }

    str = rb_str_new(NULL, n);
    rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);

    if (ec->source_encoding) {
        rb_enc_associate(str, ec->source_encoding);
    }

    return str;
}

replacement → string Show source

返回替换字符串。

ec = Encoding::Converter.new("euc-jp", "us-ascii")
p ec.replacement #=> "?"

ec = Encoding::Converter.new("euc-jp", "utf-8")
p ec.replacement #=> "\uFFFD"

/*
 * ec.replacement -> string
 *
 * Returns the converter's replacement string, built from
 * ec->replacement_str / ec->replacement_len and tagged with the encoding
 * named by ec->replacement_enc. make_replacement() is called first.
 *
 * Raises UndefinedConversionError when make_replacement() returns -1.
 */
static VALUE
econv_get_replacement(VALUE self)
{
    rb_econv_t *ec = check_econv(self);
    int ret;
    rb_encoding *enc;

    ret = make_replacement(ec);
    if (ret == -1) {
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    enc = rb_enc_find(ec->replacement_enc);
    return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
}

replacement = string显示源文件

设置替换字符串。

ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
ec.replacement = "<undef>"
p ec.convert("a \u3042 b") #=> "a <undef> b"

/*
 * ec.replacement = string
 *
 * Coerces `arg` to a string and passes its bytes, length and encoding
 * name to rb_econv_set_replacement(). Returns `arg` unchanged.
 *
 * Raises UndefinedConversionError when rb_econv_set_replacement()
 * returns -1.
 */
static VALUE
econv_set_replacement(VALUE self, VALUE arg)
{
    rb_econv_t *ec = check_econv(self);
    VALUE string = arg;
    int ret;
    rb_encoding *enc;

    StringValue(string);
    enc = rb_enc_get(string);

    ret = rb_econv_set_replacement(ec,
            (const unsigned char *)RSTRING_PTR(string),
            RSTRING_LEN(string),
            rb_enc_name(enc));

    if (ret == -1) {
        /* xxx: rb_eInvalidByteSequenceError? */
        rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
    }

    return arg;
}

source_encoding→encoding 显示源文件

以编码对象的形式返回源编码。

/*
 * ec.source_encoding -> encoding
 *
 * Returns the converter's source encoding as an Encoding object, or nil
 * when ec->source_encoding is not set.
 */
static VALUE
econv_source_encoding(VALUE self)
{
    rb_econv_t *ec = check_econv(self);

    if (!ec->source_encoding)
        return Qnil;

    return rb_enc_from_encoding(ec->source_encoding);
}