|
|
dd89bb |
/********************************************************/
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/********************************************************/
|
|
|
dd89bb |
|
|
|
dd89bb |
#include <psxtypes/psxtypes.h>
|
|
|
dd89bb |
#include <ntapi/nt_status.h>
|
|
|
dd89bb |
#include <ntapi/nt_unicode.h>
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * unofficial bit distribution table for comprehension purposes only
 *
 * scalar    nickname  utf-16    utf-8[0]  utf-8[1]  utf-8[2]  utf-8[3]
 *
 * 00000000  7x        00000000  0xxxxxxx
 * 0xxxxxxx            0xxxxxxx
 *
 * 00000yyy  5y6x      00000yyy  110yyyyy  10xxxxxx
 * yyxxxxxx            yyxxxxxx
 *
 * zzzzyyyy  4z6y6x    zzzzyyyy  1110zzzz  10yyyyyy  10xxxxxx
 * yyxxxxxx            yyxxxxxx
 *
 * 000uuuuu  5u4z6y6x  110110ww  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx
 * zzzzyyyy            wwzzzzyy
 * yyxxxxxx            110111yy
 *                     yyxxxxxx  (where wwww = uuuuu - 1)
 *
 *
 * validation of utf-8
 *
 * from      to        utf-8[0]  utf-8[1]  utf-8[2]  utf-8[3]
 *
 * 0x0000    0x007F    00..7F
 * 0x0080    0x07FF    C2..DF    80..BF
 * 0x0800    0x0FFF    E0        A0..BF    80..BF
 * 0x1000    0xCFFF    E1..EC    80..BF    80..BF
 * 0xD000    0xD7FF    ED        80..9F    80..BF
 * 0xE000    0xFFFF    EE..EF    80..BF    80..BF
 * 0x10000   0x3FFFF   F0        90..BF    80..BF    80..BF
 * 0x40000   0xFFFFF   F1..F3    80..BF    80..BF    80..BF
 * 0x100000  0x10FFFF  F4        80..8F    80..BF    80..BF
 *
**/
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/* size of the unicode code space: code points range from 0x0 to 0x10FFFF */
#define __AVAILABLE_CODE_POINTS 0x110000
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * returns the number of bytes (1 to 4) needed to encode code_point
 * in utf-8, or 0 when code_point has no well-formed utf-8 encoding:
 *
 * - values at or above __AVAILABLE_CODE_POINTS lie outside of the
 *   unicode code space, even though they still fit in 21 bits;
 * - surrogate code points (0xD800..0xDFFF) are reserved for utf-16
 *   and are rejected by the utf-8 validation table as well.
**/
int __stdcall __ntapi_uc_get_code_point_byte_count_utf8(uint32_t code_point)
{
	if (code_point >= __AVAILABLE_CODE_POINTS)
		return 0;

	if ((code_point >= 0xD800) && (code_point <= 0xDFFF))
		return 0;

	/* 7 payload bits */
	if (code_point <= 0x7F)
		return 1;

	/* 5 + 6 payload bits */
	if (code_point <= 0x7FF)
		return 2;

	/* 4 + 6 + 6 payload bits */
	if (code_point <= 0xFFFF)
		return 3;

	/* 3 + 6 + 6 + 6 payload bits, capped at 0x10FFFF above */
	return 4;
}
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * returns the number of bytes needed to encode code_point in utf-16:
 * 2 for a single code unit, 4 for a surrogate pair, or 0 when
 * code_point cannot be encoded:
 *
 * - values at or above __AVAILABLE_CODE_POINTS lie outside of the
 *   unicode code space, even though they still fit in 21 bits;
 * - surrogate code points (0xD800..0xDFFF) only have a meaning as
 *   halves of a pair and do not stand for a character of their own.
**/
int __stdcall __ntapi_uc_get_code_point_byte_count_utf16(uint32_t code_point)
{
	if (code_point >= __AVAILABLE_CODE_POINTS)
		return 0;

	if ((code_point >= 0xD800) && (code_point <= 0xDFFF))
		return 0;

	/* fits in one 16-bit code unit */
	if (code_point <= 0xFFFF)
		return 2;

	/* high surrogate + low surrogate */
	return 4;
}
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * the following is a straightforward implementation
 * of unicode conversion and validation (see also:
 * Table 3-7 of the Unicode Standard, version 6.2).
 *
 * the use of callbacks allows the validation
 * functions to be the basis of our utf-8 conversion
 * functions on the one hand, and the posix path arg
 * normalization routine on the other.
**/
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * default utf-8 callback: nothing is transcoded; the source cursor
 * is merely moved past the sequence whose length the validator
 * stored in args->byte_count.
**/
static int32_t __fastcall __default_callback_fn_utf8(nt_utf8_callback_args * args)
{
	args->src = args->src + args->byte_count;

	return NT_STATUS_SUCCESS;
}
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * returns the length in bytes (1 to 4) of the well-formed utf-8
 * sequence that starts at utf8, or 0 when the bytes at utf8 do not
 * form one (see Table 3-7 of the Unicode Standard).
 *
 * avail is the number of bytes that may be read starting at utf8
 * and must be at least 1; a sequence that would extend beyond avail
 * is reported as malformed, and no byte beyond avail is ever read.
 * trail bytes are examined in order, and examination stops at the
 * first byte that is out of range (a terminating null byte included).
**/
static unsigned char __utf8_well_formed_length(
	const unsigned char *	utf8,
	size_t			avail)
{
	unsigned char	lead = utf8[0];
	unsigned char	lo   = 0x80;	/* allowed range of utf8[1] */
	unsigned char	hi   = 0xBF;
	unsigned char	len;
	unsigned char	i;

	if (lead <= 0x7F)
		return 1;

	if ((lead >= 0xC2) && (lead <= 0xDF)) {
		len = 2;

	} else if ((lead >= 0xE0) && (lead <= 0xEF)) {
		len = 3;

		if (lead == 0xE0)
			lo = 0xA0;	/* below 0x0800: overlong */
		else if (lead == 0xED)
			hi = 0x9F;	/* 0xD800..0xDFFF: surrogates */

	} else if ((lead >= 0xF0) && (lead <= 0xF4)) {
		len = 4;

		if (lead == 0xF0)
			lo = 0x90;	/* below 0x10000: overlong */
		else if (lead == 0xF4)
			hi = 0x8F;	/* above 0x10FFFF */

	} else {
		/* stray trail byte, 0xC0, 0xC1, or 0xF5 and above */
		return 0;
	}

	/* the whole sequence must lie within the readable range */
	if (avail < len)
		return 0;

	if ((utf8[1] < lo) || (utf8[1] > hi))
		return 0;

	for (i = 2; i < len; i++)
		if ((utf8[i] < 0x80) || (utf8[i] > 0xBF))
			return 0;

	return len;
}

/**
 * validates a utf-8 stream, invoking a callback for each code point.
 *
 * ch:            beginning of the stream; ignored when callback_args
 *                is provided with a non-null src, in which case
 *                validation starts at callback_args->src.
 * size_in_bytes: number of bytes that may be examined; when zero, the
 *                stream is only delimited by its terminating null byte.
 * code_points:   incremented once for every well-formed sequence; the
 *                counter is not reset here, so the caller is expected
 *                to initialize it.
 * addr_failed:   on failure, receives the address of the first byte
 *                of the malformed sequence.
 * callback_fn:   table of five callbacks; entries 1 to 4 are invoked
 *                for sequences of the respective length, entry 0 is
 *                invoked once when a terminating null byte is reached
 *                within bounds. defaults to callbacks that only
 *                advance the source cursor.
 * callback_args: passed to every callback with byte_count set to the
 *                length of the current sequence; validation resumes
 *                at callback_args->src after each callback, so a
 *                callback must advance src.
 *
 * returns NT_STATUS_SUCCESS when the end of the stream was reached,
 * or NT_STATUS_ILLEGAL_CHARACTER upon the first malformed sequence.
 * values returned by the callbacks are not examined.
**/
int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf8(
	__in	const unsigned char *		ch,
	__in	size_t				size_in_bytes	__optional,
	__out	size_t *			code_points	__optional,
	__out	void **				addr_failed	__optional,
	__in	ntapi_uc_utf8_callback_fn **	callback_fn	__optional,
	__in	nt_utf8_callback_args *		callback_args	__optional)
{
	const unsigned char *		utf8;
	const unsigned char *		ch_boundary;
	unsigned char			byte_count;
	size_t				_code_points = 0;
	int				i;

	ntapi_uc_utf8_callback_fn *	_callback_fn[5];
	nt_utf8_callback_args		_callback_args;

	if (!callback_fn) {
		for (i = 0; i < 5; i++)
			_callback_fn[i] = __default_callback_fn_utf8;

		callback_fn = _callback_fn;
	}

	if (!callback_args) {
		callback_args = &_callback_args;
		callback_args->src = (unsigned char *)0;
	}

	/* a non-null src takes precedence over ch (resumed validation) */
	if (callback_args->src)
		ch = callback_args->src;
	else
		callback_args->src = ch;

	/* no size given: place the boundary at the top of the address space */
	if (size_in_bytes)
		ch_boundary = (const unsigned char *)((uintptr_t)ch + size_in_bytes);
	else
		ch_boundary = (const unsigned char *)~(uintptr_t)0;

	/* the local counter starts at zero, so that it is never read uninitialized */
	if (!code_points)
		code_points = &_code_points;

	while ((ch < ch_boundary) && (*ch)) {
		utf8       = ch;
		byte_count = __utf8_well_formed_length(
				utf8,
				(size_t)((uintptr_t)ch_boundary - (uintptr_t)utf8));

		if (!byte_count) {
			if (addr_failed)
				*addr_failed = (void *)utf8;

			return NT_STATUS_ILLEGAL_CHARACTER;
		}

		(*code_points)++;
		callback_args->byte_count = byte_count;
		callback_fn[byte_count](callback_args);

		/* advance, transcode if needed */
		ch = callback_args->src;
	}

	/* terminating null byte within bounds: notify via entry 0 */
	if ((ch < ch_boundary) && (*ch == 0))
		callback_fn[0](callback_args);

	return NT_STATUS_SUCCESS;
}
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * default utf-16 callback: nothing is transcoded; the source cursor
 * is merely moved past the current code point, which takes up two
 * code units when byte_count is 4 and one code unit otherwise.
**/
static int32_t __fastcall __default_callback_fn_utf16(nt_utf16_callback_args * args)
{
	args->src += (args->byte_count == 4) ? 2 : 1;

	return NT_STATUS_SUCCESS;
}
|
|
|
dd89bb |
|
|
|
dd89bb |
|
|
|
dd89bb |
/**
 * validates a utf-16 stream, invoking a callback for each code point.
 *
 * wch:           beginning of the stream; ignored when callback_args
 *                is provided with a non-null src, in which case
 *                validation starts at callback_args->src.
 * size_in_bytes: number of bytes that may be examined; when zero, the
 *                stream is only delimited by its terminating null
 *                code unit.
 * code_points:   incremented once for every valid code point; the
 *                counter is not reset here, so the caller is expected
 *                to initialize it.
 * addr_failed:   on failure, receives the address of the offending
 *                code unit (an unpaired surrogate).
 * callback_fn:   table of five callbacks; entries 1 to 4 are selected
 *                by the byte count computed below (1 for code units up
 *                to 0x7F, 2 up to 0x7FF, 3 for any other non-surrogate
 *                code unit, 4 for a surrogate pair), entry 0 is invoked
 *                once when a terminating null code unit is reached
 *                within bounds. defaults to callbacks that only
 *                advance the source cursor.
 * callback_args: passed to every callback with byte_count set as
 *                described above; validation resumes at
 *                callback_args->src after each callback, so a
 *                callback must advance src.
 *
 * returns NT_STATUS_SUCCESS when the end of the stream was reached,
 * or NT_STATUS_ILLEGAL_CHARACTER upon the first unpaired surrogate.
 * values returned by the callbacks are not examined.
**/
int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf16(
	__in	const wchar16_t *		wch,
	__in	size_t				size_in_bytes	__optional,
	__out	size_t *			code_points	__optional,
	__out	void **				addr_failed	__optional,
	__in	ntapi_uc_utf16_callback_fn **	callback_fn	__optional,
	__in	nt_utf16_callback_args *	callback_args	__optional)
{
	const wchar16_t *		wch_trail;
	const wchar16_t *		wch_boundary;
	unsigned char			byte_count;
	size_t				_code_points = 0;
	int				i;

	ntapi_uc_utf16_callback_fn *	_callback_fn[5];
	nt_utf16_callback_args		_callback_args;

	if (!callback_fn) {
		for (i = 0; i < 5; i++)
			_callback_fn[i] = __default_callback_fn_utf16;

		callback_fn = _callback_fn;
	}

	if (!callback_args) {
		callback_args = &_callback_args;
		callback_args->src = (wchar16_t *)0;
	}

	/* a non-null src takes precedence over wch (resumed validation) */
	if (callback_args->src)
		wch = callback_args->src;
	else
		callback_args->src = wch;

	/* no size given: place the boundary at the top of the address space */
	if (size_in_bytes)
		wch_boundary = (const wchar16_t *)((uintptr_t)wch + size_in_bytes);
	else
		wch_boundary = (const wchar16_t *)~(uintptr_t)0;

	/* the local counter starts at zero, so that it is never read uninitialized */
	if (!code_points)
		code_points = &_code_points;

	while ((wch < wch_boundary) && (*wch)) {
		byte_count = 0;

		if (*wch <= 0x7F) {
			byte_count = 1;

		} else if (*wch <= 0x7FF) {
			byte_count = 2;

		} else if ((*wch < 0xD800) || (*wch >= 0xE000)) {
			byte_count = 3;

		} else if (*wch < 0xDC00) {
			/* high surrogate: a low surrogate must follow within bounds */
			wch_trail = wch + 1;

			if ((wch_trail < wch_boundary)
					&& (*wch_trail >= 0xDC00)
					&& (*wch_trail < 0xE000))
				byte_count = 4;
		}

		/* unpaired high surrogate, or low surrogate without a lead */
		if (!byte_count) {
			if (addr_failed)
				*addr_failed = (void *)wch;

			return NT_STATUS_ILLEGAL_CHARACTER;
		}

		(*code_points)++;
		callback_args->byte_count = byte_count;
		callback_fn[byte_count](callback_args);

		/* advance, transcode if needed */
		wch = callback_args->src;
	}

	/* terminating null code unit within bounds: notify via entry 0 */
	if ((wch < wch_boundary) && (*wch == 0))
		callback_fn[0](callback_args);

	return NT_STATUS_SUCCESS;
}
|