diff --git a/src/argv/ntapi_tt_array_utf8.c b/src/argv/ntapi_tt_array_utf8.c index b1c95b5..a2f6d0f 100644 --- a/src/argv/ntapi_tt_array_utf8.c +++ b/src/argv/ntapi_tt_array_utf8.c @@ -10,6 +10,137 @@ #include #include "ntapi_impl.h" +typedef struct ___two_bytes { + unsigned char low; + unsigned char high; +} __two_bytes; + + +typedef struct ___three_bytes { + unsigned char low; + unsigned char middle; + unsigned char high; +} __three_bytes; + +static void __utf8_to_utf16_handler_1byte_or_null_termination(wchar16_t * dst, const unsigned char * ch) +{ + /***************************/ + /* from: 0xxxxxxx */ + /* to: 00000000 0xxxxxxx */ + /***************************/ + + *dst = *ch; +} + + +static void __utf8_to_utf16_handler_2bytes(wchar16_t * dst, const unsigned char * ch) +{ + /***************************/ + /* from: 110yyyyy 10xxxxxx */ + /* to: 00000yyy yyxxxxxx */ + /***************************/ + + __two_bytes * src; /* big endian */ + + src = (__two_bytes *)ch; + + /* yyyyy */ + *dst = (src->low ^ 0xC0); + *dst <<= 6; + + /* xxxxxx */ + *dst |= (src->high ^ 0x80); +} + + +static void __utf8_to_utf16_handler_3bytes(wchar16_t * dst, const unsigned char * ch) +{ + /************************************/ + /* from: 1110zzzz 10yyyyyy 10xxxxxx */ + /* to: zzzzyyyy yyxxxxxx */ + /************************************/ + + __three_bytes * src; /* big endian */ + wchar16_t yyyyy; + + src = (__three_bytes *)ch; + + /* zzzz */ + *dst = (src->low ^ 0xE0); + *dst <<= 12; + + /* yyyyy */ + yyyyy = (src->middle ^ 0x80); + yyyyy <<= 6; + *dst |= yyyyy; + + /* xxxxxx */ + *dst |= (src->high ^ 0x80); +} + + +static void __utf8_to_utf16_handler_4bytes(wchar16_t * dst, const unsigned char * ch) +{ + /*************************************************/ + /* from: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx */ + /* to: 110110ww wwzzzzyy 110111yy yyxxxxxx */ + /*************************************************/ + + __two_bytes * src_low; /* big endian */ + __two_bytes * src_high; /* big endian */ + wchar16_t * dst_lead; + wchar16_t * dst_trail; + + wchar16_t wwww; + wchar16_t lead; + wchar16_t trail; + unsigned char ulow; + unsigned char uhigh; + unsigned char yy; + unsigned char yyyy; + unsigned char zzzz; + + dst_lead = dst_trail = (wchar16_t *)dst; + dst_trail++; + + src_low = src_high = (__two_bytes *)ch; + src_high++; + + /* uuuuu */ + ulow = src_low->low ^ 0xF0; + uhigh = src_low->high ^ 0x80; + + ulow <<= 2; + uhigh >>= 4; + + /* wwww */ + wwww = (ulow | uhigh) - 1; + wwww <<= 6; + + /* 110110ww wwzzzzyy */ + yy = src_high->low ^ 0x80; + yy >>= 4; + + zzzz = src_low->high; + zzzz <<= 4; + zzzz >>= 2; + + lead = 0xD800; + lead |= wwww; + lead |= zzzz; + lead |= yy; + + /* 110111yy yyxxxxxx */ + yyyy = src_high->low << 4; + trail = yyyy << 2; + trail |= src_high->high ^ 0x80; + trail |= 0xDC00; + + /* write */ + *dst_lead = lead; + *dst_trail = trail; +} + int32_t __stdcall __ntapi_tt_array_copy_utf8( __out int * argc, __in const char ** argv, @@ -140,21 +271,122 @@ int32_t __stdcall __ntapi_tt_array_copy_utf8( return NT_STATUS_SUCCESS; } +static void (*__utf8_to_utf16_handlers[5])(wchar16_t *, const unsigned char *) = { + 0, + __utf8_to_utf16_handler_1byte_or_null_termination, + __utf8_to_utf16_handler_2bytes, + __utf8_to_utf16_handler_3bytes, + __utf8_to_utf16_handler_4bytes}; + int32_t __stdcall __ntapi_tt_array_convert_utf8_to_utf16( __in char ** arrv, - __in wchar16_t ** arra, - __in void * base, - __in wchar16_t * buffer, + __out wchar16_t ** warrv, + __out void * base, + __out wchar16_t * buffer, __in size_t buffer_len, __out size_t * bytes_written) { - (void)arrv; - (void)arra; - (void)base; - (void)buffer; - (void)buffer_len; + wchar16_t * ubound; + wchar16_t * wch; + ptrdiff_t diff; + ptrdiff_t wdiff; + char * ch; + const uint8_t * utf8; + uint8_t byte_count; + + if ((uintptr_t)base % sizeof(wchar16_t)) + return NT_STATUS_INVALID_PARAMETER_3; + + wch = buffer; + diff = (ptrdiff_t)base; + wdiff = (ptrdiff_t)base / sizeof(wchar16_t); + + ubound = buffer; + ubound += buffer_len / sizeof(wchar16_t); + ubound--; + ubound--; + ubound--; + + for (; arrv && *arrv; arrv++,warrv++) { + *warrv = wch - wdiff; + ch = *arrv + diff; + + /* ubound already accounts for null termination, see above */ + for (; *ch && (wch= 0xC2) && (utf8[0] <= 0xDF) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF)) + byte_count = 2; + + /* try three bytes */ + else if ((++ch) + && (utf8[0] == 0xE0) + && (utf8[1] >= 0xA0) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] >= 0xE1) && (utf8[0] <= 0xEC) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] == 0xED) + && (utf8[1] >= 0x80) && (utf8[1] <= 0x9F) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + else if ( + (utf8[0] >= 0xEE) && (utf8[0] <= 0xEF) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF)) + byte_count = 3; + + /* try four bytes */ + else if ((++ch) + && (utf8[0] == 0xF0) + && (utf8[1] >= 0x90) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + else if ( + (utf8[0] >= 0xF1) && (utf8[0] <= 0xF3) + && (utf8[1] >= 0x80) && (utf8[1] <= 0xBF) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + else if ( + (utf8[0] == 0xF4) + && (utf8[1] >= 0x80) && (utf8[1] <= 0x8F) + && (utf8[2] >= 0x80) && (utf8[2] <= 0xBF) + && (utf8[3] >= 0x80) && (utf8[3] <= 0xBF)) + byte_count = 4; + + if (byte_count) { + __utf8_to_utf16_handlers[byte_count](wch,utf8); + wch = &wch[byte_count >> 3]; + wch++; + } else { + return NT_STATUS_ILLEGAL_CHARACTER; + } + } + + *wch++ = 0; + } - *bytes_written = 0; + *warrv = 0; + *bytes_written = sizeof(wchar16_t) * (wch - buffer); return NT_STATUS_SUCCESS; }