Skip to content

Commit 34cd935

Browse files
committed
- cxon: utf-8 validation with DFA (Höhrmann/Felker)
1 parent d62ae27 commit 34cd935

1 file changed

Lines changed: 39 additions & 1 deletion

File tree

src/cxon/lang/common/cio/char.hxx

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ namespace cxon { namespace cio { namespace chr {
305305
}
306306

307307
template <typename II>
308-
inline int utf8_check(II i, II e) {
308+
inline int utf8_check1(II i, II e) {
309309
// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf
310310
// p41, Table 3-7. Well-Formed UTF-8 Byte Sequences
311311
CXON_ASSERT(i != e, "unexpected");
@@ -372,6 +372,44 @@ namespace cxon { namespace cio { namespace chr {
372372
return 0;
373373
}
374374

375+
namespace imp {
    // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
    // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
    // TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
    // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
    // CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
    // IN THE SOFTWARE.

    // Layout:
    //   [0, 256)   - maps a byte to its character class (0..11)
    //   [256, 364) - transition table, 9 states x 12 classes; states are stored pre-multiplied by 12,
    //                so the next state is found at 256 + state + class
    // States: 0 is the start/accept state, 12 is the reject state, the rest are intermediate.
    static const unsigned char utf8_decode_dfa_[] = {
        // byte -> character class
        // 0x00..0x7F: ASCII must be class 0, so that it is accepted as a one-byte sequence from the start state;
        // class 11 belongs to the 0xF0 lead byte only and would send ASCII into the 4-byte sequence path
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        // 0x80..0x9F
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        // 0xA0..0xBF
         7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        // 0xC0..0xDF
         8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0..0xFF
        10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        // (state, character class) -> state, one row of 12 classes per state
         0,12,24,36,60,96,84,12,12,12,48,72,    // state  0
        12,12,12,12,12,12,12,12,12,12,12,12,    // state 12
        12, 0,12,12,12,12,12, 0,12, 0,12,12,    // state 24
        12,24,12,12,12,12,12,24,12,24,12,12,    // state 36
        12,12,12,12,12,12,12,24,12,12,12,12,    // state 48
        12,24,12,12,12,12,12,12,12,24,12,12,    // state 60
        12,12,12,12,12,12,12,36,12,36,12,12,    // state 72
        12,36,12,12,12,12,12,36,12,36,12,12,    // state 84
        12,36,12,12,12,12,12,12,12,12,12,12     // state 96
    };

    // Advances the DFA by one byte and returns the next state.
    //   state - current state, a multiple of 12 in [0, 96]
    //   byte  - input byte, must be in [0, 255] (the table is indexed with it unchecked)
    inline unsigned utf8_decode_(unsigned state, unsigned byte) {
        return utf8_decode_dfa_[256 + state + utf8_decode_dfa_[byte]];
    }
}
402+
template <typename II>
403+
inline int utf8_check(II b, II e) {
404+
unsigned s = 0;
405+
for (II i = b; i != e; ++i) {
406+
s = imp::utf8_decode_(s, (unsigned char)*i);
407+
if (s == 0) return int(i + 1 - b);
408+
if (s == 12) return 0;
409+
}
410+
return 0;
411+
}
412+
375413
}}}
376414

377415
namespace cxon { namespace cio { namespace chr {

0 commit comments

Comments
 (0)