From 4aad7e746fed2e41d560111f3c5a05193d1fcd8b Mon Sep 17 00:00:00 2001 From: Michael Rykov Date: Thu, 2 Dec 2021 23:41:22 +0800 Subject: [PATCH] encoding: publicly expose identifier.{MIB,Interface} --- encoding/ianaindex/ianaindex.go | 9 +++ encoding/ianaindex/ianaindex_test.go | 15 ++++ encoding/identifier/identifier.go | 79 ++++++++++++++++++++++ encoding/internal/identifier/identifier.go | 79 +++------------------- 4 files changed, 112 insertions(+), 70 deletions(-) create mode 100644 encoding/identifier/identifier.go diff --git a/encoding/ianaindex/ianaindex.go b/encoding/ianaindex/ianaindex.go index f4b18875c..4b17b35b1 100644 --- a/encoding/ianaindex/ianaindex.go +++ b/encoding/ianaindex/ianaindex.go @@ -104,6 +104,15 @@ func (x *Index) Name(e encoding.Encoding) (string, error) { return x.names(v), nil } +// FindMIB returns the encoding that x stores for mib, or errUnsupported when findMIB reports no match (-1). +func (x *Index) FindMIB(mib identifier.MIB) (encoding.Encoding, error) { + v := findMIB(x.toMIB, mib) + if v == -1 { + return nil, errUnsupported + } + return x.enc[v], nil // NOTE(review): presumably x.enc[v] can be nil for listed but unimplemented encodings; confirm +} + // TODO: the coverage of this index is rather spotty. 
Allowing users to set // encodings would allow: // - users to increase coverage diff --git a/encoding/ianaindex/ianaindex_test.go b/encoding/ianaindex/ianaindex_test.go index d545fcf23..49b68fce1 100644 --- a/encoding/ianaindex/ianaindex_test.go +++ b/encoding/ianaindex/ianaindex_test.go @@ -105,6 +105,21 @@ func TestEncoding(t *testing.T) { if got, err := tc.index.Name(enc); got != tc.canonical { t.Errorf("%d: Name(Encoding(%q)) = %q; want %q (%v)", i, tc.name, got, tc.canonical, err) } + + id, ok := enc.(identifier.Interface) + if !ok { + t.Fatalf("%d: encoding %q has no ID", i, tc.name) // stop here: id is nil, so the id.ID() call below would panic + } + mib, _ := id.ID() + if mib == 0 { + t.Errorf("%d: encoding %q returned 0 MIB enum", i, tc.name) + } + mibEnc, err := tc.index.FindMIB(mib) + if err != nil { + t.Errorf("%d: FindMIB error %q", i, err) + } else if mibEnc != enc { + t.Errorf("%d: FindMIB did not match encoding", i) + } } } diff --git a/encoding/identifier/identifier.go b/encoding/identifier/identifier.go new file mode 100644 index 000000000..d09d343e8 --- /dev/null +++ b/encoding/identifier/identifier.go @@ -0,0 +1,79 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package identifier defines the contract between implementations of Encoding +// and Index by defining identifiers that uniquely identify standardized coded +// character sets (CCS) and character encoding schemes (CES), which we will +// together refer to as encodings, for which Encoding implementations provide +// converters to and from UTF-8. This package is typically only of concern to +// implementers of Indexes and Encodings. +// +// One part of the identifier is the MIB code, which is defined by IANA and +// uniquely identifies a CCS or CES. Each code is associated with data that +// references authorities, official documentation as well as aliases and MIME +// names. +// +// Not all CESs are covered by the IANA registry. 
The "other" string that is +// returned by ID can be used to identify other character sets or versions of +// existing ones. +// +// It is recommended that each package that provides a set of Encodings provide +// the All and Common variables to reference all supported encodings and a +// commonly used subset. This allows Index implementations to include all +// available encodings without explicitly referencing or knowing about them. +package identifier + +// Note: this package was originally internal; it is public so that third +// parties can write their own Indexes and Encodings. + +// References: +// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt +// - http://www.iana.org/assignments/character-sets/character-sets.xhtml +// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib +// - http://www.ietf.org/rfc/rfc2978.txt +// - https://www.unicode.org/reports/tr22/ +// - http://www.w3.org/TR/encoding/ +// - https://encoding.spec.whatwg.org/ +// - https://encoding.spec.whatwg.org/encodings.json +// - https://tools.ietf.org/html/rfc6657#section-5 + +// Interface can be implemented by Encodings to define the CCS or CES for which +// it implements conversions. +type Interface interface { + // ID returns an encoding identifier. Exactly one of the mib and other + // values should be non-zero. + // + // In the usual case it is only necessary to indicate the MIB code. The + // other string can be used to specify encodings for which there is no MIB, + // such as "x-mac-dingbat". + // + // The other string may only contain the characters a-z, A-Z, 0-9, - and _. + ID() (mib MIB, other string) + + // NOTE: the restrictions on the encoding are to allow extending the syntax + // with additional information such as versions, vendors and other variants. +} + +// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds +// some identifiers for some encodings that are not covered by the IANA +// standard. 
+// +// See http://www.iana.org/assignments/ianacharset-mib. +type MIB uint16 + +// These additional MIB values are not defined by IANA. They are added because +// they are common and defined within the text repo. iota numbers them from 10000. +const ( + // Unofficial (10000) marks the start of encodings not registered by IANA. + Unofficial MIB = 10000 + iota + + // Replacement is the WhatWG replacement encoding. + Replacement + + // XUserDefined is the code for x-user-defined. + XUserDefined + + // MacintoshCyrillic is the code for x-mac-cyrillic. + MacintoshCyrillic +) diff --git a/encoding/internal/identifier/identifier.go b/encoding/internal/identifier/identifier.go index 5c9b85c28..e3db97633 100644 --- a/encoding/internal/identifier/identifier.go +++ b/encoding/internal/identifier/identifier.go @@ -4,78 +4,17 @@ //go:generate go run gen.go -// Package identifier defines the contract between implementations of Encoding -// and Index by defining identifiers that uniquely identify standardized coded -// character sets (CCS) and character encoding schemes (CES), which we will -// together refer to as encodings, for which Encoding implementations provide -// converters to and from UTF-8. This package is typically only of concern to -// implementers of Indexes and Encodings. -// -// One part of the identifier is the MIB code, which is defined by IANA and -// uniquely identifies a CCS or CES. Each code is associated with data that -// references authorities, official documentation as well as aliases and MIME -// names. -// -// Not all CESs are covered by the IANA registry. The "other" string that is -// returned by ID can be used to identify other character sets or versions of -// existing ones. -// -// It is recommended that each package that provides a set of Encodings provide -// the All and Common variables to reference all supported encodings and -// commonly used subset. This allows Index implementations to include all -// available encodings without explicitly referencing or knowing about them. 
package identifier -// Note: this package is internal, but could be made public if there is a need -// for writing third-party Indexes and Encodings. - -// References: -// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt -// - http://www.iana.org/assignments/character-sets/character-sets.xhtml -// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib -// - http://www.ietf.org/rfc/rfc2978.txt -// - https://www.unicode.org/reports/tr22/ -// - http://www.w3.org/TR/encoding/ -// - https://encoding.spec.whatwg.org/ -// - https://encoding.spec.whatwg.org/encodings.json -// - https://tools.ietf.org/html/rfc6657#section-5 - -// Interface can be implemented by Encodings to define the CCS or CES for which -// it implements conversions. -type Interface interface { - // ID returns an encoding identifier. Exactly one of the mib and other - // values should be non-zero. - // - // In the usual case it is only necessary to indicate the MIB code. The - // other string can be used to specify encodings for which there is no MIB, - // such as "x-mac-dingbat". - // - // The other string may only contain the characters a-z, A-Z, 0-9, - and _. - ID() (mib MIB, other string) - - // NOTE: the restrictions on the encoding are to allow extending the syntax - // with additional information such as versions, vendors and other variants. -} - -// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds -// some identifiers for some encodings that are not covered by the IANA -// standard. -// -// See http://www.iana.org/assignments/ianacharset-mib. -type MIB uint16 - -// These additional MIB types are not defined in IANA. They are added because -// they are common and defined within the text repo. -const ( - // Unofficial marks the start of encodings not registered by IANA. - Unofficial MIB = 10000 + iota +import ( + "golang.org/x/text/encoding/identifier" +) - // Replacement is the WhatWG replacement encoding. 
- Replacement +const Replacement = identifier.Replacement // const, not var: stays immutable and usable in constant expressions, as before the move - // XUserDefined is the code for x-user-defined. - XUserDefined +type Interface = identifier.Interface // alias of the public type +type MIB = identifier.MIB // alias of the public type - // MacintoshCyrillic is the code for x-mac-cyrillic. - MacintoshCyrillic -) +const Unofficial = identifier.Unofficial +const MacintoshCyrillic = identifier.MacintoshCyrillic +const XUserDefined = identifier.XUserDefined