Blame src/unicode/ntapi_uc_unicode_validation.c

dd89bb
/********************************************************/
dd89bb
/*  ntapi: Native API core library                      */
59d585
/*  Copyright (C) 2013--2021  Z. Gilboa                 */
dd89bb
/*  Released under GPLv2 and GPLv3; see COPYING.NTAPI.  */
dd89bb
/********************************************************/
dd89bb
dd89bb
#include <psxtypes/psxtypes.h>
dd89bb
#include <ntapi/nt_status.h>
dd89bb
#include <ntapi/nt_unicode.h>
766a63
#include "ntapi_impl.h"
dd89bb
dd89bb
/**
dd89bb
 *  unofficial bit distribution table for comprehension purposes only
dd89bb
 *
dd89bb
 *  scalar	nickname	utf-16		utf-8[0]  utf-8[1]  utf-8[2]  utf-8[3]
dd89bb
 *  ------	--------	--------	--------  --------  --------  --------
dd89bb
 *  00000000	7x		00000000	0xxxxxxx
dd89bb
 *  0xxxxxxx			0xxxxxxx
dd89bb
 *
dd89bb
 *  00000yyy	5y6x		00000yyy	110yyyyy  10xxxxxx
dd89bb
 *  yyxxxxxx			yyxxxxxx
dd89bb
 *
dd89bb
 *  zzzzyyyy	4z6y6x		zzzzyyyy	1110zzzz  10yyyyyy  10xxxxxx
dd89bb
 *  yyxxxxxx			yyxxxxxx
dd89bb
 *
dd89bb
 *  000uuuuu	5u4z6y6x	110110ww	11110uuu  10uuzzzz  10yyyyyy  10xxxxxx
dd89bb
 *  zzzzyyyy			wwzzzzyy
dd89bb
 *  yyxxxxxx			110111yy
dd89bb
 *				yyxxxxxx        (where wwww = uuuuu - 1)
dd89bb
 *
dd89bb
 *
dd89bb
 *  validation of utf-8
dd89bb
 *
dd89bb
 *  from        to          utf-8[0]      utf-8[1]      utf-8[2]      utf-8[3]
dd89bb
 *  ------      ------      --------      --------      --------      --------
dd89bb
 *  0x0000      0x007F      00..7F
dd89bb
 *  0x0080      0x07FF      C2..DF        80..BF
dd89bb
 *  0x0800      0x0FFF      E0            A0..BF        80..BF
dd89bb
 *  0x1000      0xCFFF      E1..EC        80..BF        80..BF
dd89bb
 *  0xD000      0xD7FF      ED            80..9F        80..BF
dd89bb
 *  0xE000      0xFFFF      EE..EF        80..BF        80..BF
dd89bb
 *  0x10000     0x3FFFF     F0            90..BF        80..BF        80..BF
dd89bb
 *  0x40000     0xFFFFF     F1..F3        80..BF        80..BF        80..BF
dd89bb
 *  0x100000    0x10FFFF    F4            80..8F        80..BF        80..BF
dd89bb
 *
dd89bb
**/
dd89bb
dd89bb
dd89bb
#define __AVAILABLE_CODE_POINTS	0x110000

/**
 *  returns the number of bytes needed to encode code_point
 *  in utf-8 (1..4), or 0 when code_point is not below
 *  __AVAILABLE_CODE_POINTS (that is, above 0x10FFFF).
 *
 *  note: values in the surrogate range (0xD800..0xDFFF)
 *  are not filtered here and are reported as three bytes.
**/
int __stdcall __ntapi_uc_get_code_point_byte_count_utf8(uint32_t code_point)
{
	/* __AVAILABLE_CODE_POINTS exceeded; a plain 21-bit test */
	/* would wrongly accept 0x110000..0x1FFFFF as four bytes */
	if (code_point >= __AVAILABLE_CODE_POINTS)
		return 0;

	/* try clearing 7x bits */
	else if ((code_point >> 7) == 0)
		return 1;

	/* try clearing 5y + 6x bits */
	else if ((code_point >> 11) == 0)
		return 2;

	/* try clearing 4z + 6y + 6x bits */
	else if ((code_point >> 16) == 0)
		return 3;

	/* 5u + 4z + 6y + 6x bits, known to be within range */
	else
		return 4;
}
dd89bb
dd89bb
dd89bb
/**
 *  returns the number of bytes needed to encode code_point
 *  in utf-16: 2 for a single 16-bit unit, 4 for a surrogate
 *  pair, or 0 when code_point is not below
 *  __AVAILABLE_CODE_POINTS (that is, above 0x10FFFF).
 *
 *  note: values in the surrogate range (0xD800..0xDFFF)
 *  are not filtered here and are reported as two bytes.
**/
int __stdcall __ntapi_uc_get_code_point_byte_count_utf16(uint32_t code_point)
{
	/* __AVAILABLE_CODE_POINTS exceeded; a plain 21-bit test */
	/* would wrongly accept 0x110000..0x1FFFFF as four bytes */
	if (code_point >= __AVAILABLE_CODE_POINTS)
		return 0;

	/* try clearing 4z + 6y + 6x bits */
	else if ((code_point >> 16) == 0)
		return 2;

	/* 5u + 4z + 6y + 6x bits, known to be within range */
	else
		return 4;
}
dd89bb
dd89bb
dd89bb
/**
dd89bb
 *  following is a straight-forward implementation
dd89bb
 *  of unicode conversion and validation (see also:
dd89bb
 *  Table 3-7 of the Unicode Standard, version 6.2).
dd89bb
 *
dd89bb
 *  the use of callbacks allows the validation
dd89bb
 *  functions to be the basis of our utf-8 conversion
dd89bb
 *  functions on the one hand, and the posix path arg
dd89bb
 *  normalization routine on the other.
dd89bb
**/
dd89bb
dd89bb
/**
 *  default utf-8 callback: does nothing but advance the
 *  source pointer by the byte count recorded in args, and
 *  always reports success.
**/
static int32_t __fastcall __default_callback_fn_utf8(nt_utf8_callback_args * args)
{
	/* step over the sequence that was just validated */
	args->src = args->src + args->byte_count;

	return NT_STATUS_SUCCESS;
}
dd89bb
dd89bb
/**
 *  classifies the utf-8 sequence that starts at utf8, where
 *  avail is the number of bytes that may be read from utf8
 *  (at least one).
 *
 *  returns the length of the well-formed sequence (1..4),
 *  or 0 if the sequence is ill-formed or does not fit within
 *  avail bytes. continuation bytes are examined in order and
 *  the scan stops at the first mismatch, so a null terminator
 *  inside a truncated sequence is never read past.
**/
static unsigned char __utf8_sequence_byte_count(
	const unsigned char *	utf8,
	uintptr_t		avail)
{
	unsigned char	lead;
	unsigned char	count;
	unsigned char	lo;
	unsigned char	hi;
	unsigned char	idx;

	lead = utf8[0];

	/* permitted range of the second byte; narrowed below */
	/* for the lead bytes E0, ED, F0 and F4               */
	lo   = 0x80;
	hi   = 0xBF;

	/* one byte */
	if (lead <= 0x7F)
		return 1;

	/* stray continuation byte, or overlong lead (C0, C1) */
	else if (lead < 0xC2)
		return 0;

	/* two bytes */
	else if (lead <= 0xDF)
		count = 2;

	/* three bytes */
	else if (lead <= 0xEF) {
		count = 3;

		if (lead == 0xE0)
			lo = 0xA0;	/* rejects overlong forms */
		else if (lead == 0xED)
			hi = 0x9F;	/* rejects surrogates     */
	}

	/* four bytes */
	else if (lead <= 0xF4) {
		count = 4;

		if (lead == 0xF0)
			lo = 0x90;	/* rejects overlong forms */
		else if (lead == 0xF4)
			hi = 0x8F;	/* rejects > 0x10FFFF     */
	}

	/* F5..FF never appear in well-formed utf-8 */
	else
		return 0;

	/* the entire sequence must lie within the stream */
	if (avail < count)
		return 0;

	if ((utf8[1] < lo) || (utf8[1] > hi))
		return 0;

	for (idx = 2; idx < count; idx++)
		if ((utf8[idx] < 0x80) || (utf8[idx] > 0xBF))
			return 0;

	return count;
}

/**
 *  validates a utf-8 stream, invoking a callback for each
 *  well-formed sequence.
 *
 *  ch:            start of the stream; overridden by
 *                 callback_args->src when that is non-null.
 *  size_in_bytes: size of the stream; when zero, the stream
 *                 is unbounded and scanning stops at the
 *                 first null byte.
 *  code_points:   incremented once per validated sequence;
 *                 it is not reset by this function.
 *  addr_failed:   receives the address of the first
 *                 ill-formed sequence.
 *  callback_fn:   table of five callbacks; entries 1..4 are
 *                 selected by sequence length, entry 0 is
 *                 invoked for a null terminator found within
 *                 bounds. defaults to callbacks that only
 *                 advance the source pointer.
 *  callback_args: passed to every callback; after each call
 *                 its src member determines where scanning
 *                 resumes.
 *
 *  returns NT_STATUS_ILLEGAL_CHARACTER upon an ill-formed
 *  (or truncated) sequence, the first non-zero status
 *  returned by a callback, or otherwise the status of the
 *  terminator callback / NT_STATUS_SUCCESS.
**/
int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf8(
	__in	const unsigned char *		ch,
	__in	size_t				size_in_bytes	__optional,
	__out	size_t *			code_points	__optional,
	__out	const unsigned char **		addr_failed	__optional,
	__in	ntapi_uc_utf8_callback_fn **	callback_fn	__optional,
	__in	nt_utf8_callback_args *		callback_args	__optional)
{
	int32_t			status;
	int			i;
	const unsigned char *	ch_boundary;
	unsigned char		byte_count;
	size_t			_code_points;

	ntapi_uc_utf8_callback_fn *	_callback_fn[5];
	nt_utf8_callback_args		_callback_args;

	/* discarded counter, used when the caller passes none; */
	/* initialized so that it is never read indeterminate   */
	_code_points = 0;

	if (!callback_fn) {
		for (i = 0; i < 5; i++)
			_callback_fn[i] = __default_callback_fn_utf8;

		callback_fn = _callback_fn;
	}

	if (!callback_args) {
		callback_args = &_callback_args;
		callback_args->src = (unsigned char *)0;
	}

	/* a preset source pointer takes precedence over ch */
	if (callback_args->src)
		ch = callback_args->src;
	else
		callback_args->src = ch;

	/* no size given: use the highest address as the boundary */
	if (size_in_bytes)
		ch_boundary = (const unsigned char *)((uintptr_t)ch + size_in_bytes);
	else
		ch_boundary = (const unsigned char *)(~(uintptr_t)0);

	if (!code_points)
		code_points = &_code_points;

	while ((ch < ch_boundary) && (*ch)) {
		/* the helper never reads beyond ch_boundary */
		byte_count = __utf8_sequence_byte_count(
			ch,
			(uintptr_t)ch_boundary - (uintptr_t)ch);

		if (!byte_count) {
			if (addr_failed)
				*addr_failed = ch;

			return NT_STATUS_ILLEGAL_CHARACTER;
		}

		(*code_points)++;
		callback_args->byte_count = byte_count;

		if ((status = callback_fn[byte_count](callback_args)))
			return status;

		/* advance, transcode if needed */
		ch = callback_args->src;
	}

	/* null terminator within bounds: hand it to callback zero */
	if ((ch < ch_boundary) && (*ch == 0)) {
		callback_args->byte_count = 1;
		return callback_fn[0](callback_args);
	}

	return NT_STATUS_SUCCESS;
}
dd89bb
dd89bb
dd89bb
/**
 *  default utf-16 callback: does nothing but advance the
 *  source pointer, by two units when the recorded byte
 *  count is four and by one unit otherwise, and always
 *  reports success.
**/
static int32_t __fastcall __default_callback_fn_utf16(nt_utf16_callback_args * args)
{
	/* a byte count of four is what the utf-16 validator */
	/* in this file records for a surrogate pair         */
	args->src += (args->byte_count == 4) ? 2 : 1;

	return NT_STATUS_SUCCESS;
}
dd89bb
dd89bb
dd89bb
/**
 *  validates a utf-16 stream, invoking a callback for each
 *  well-formed code point.
 *
 *  wch:           start of the stream; overridden by
 *                 callback_args->src when that is non-null.
 *  size_in_bytes: size of the stream in bytes (not units);
 *                 when zero, the stream is unbounded and
 *                 scanning stops at the first null unit.
 *  code_points:   incremented once per validated code
 *                 point; it is not reset by this function.
 *  addr_failed:   receives the address of the first
 *                 unpaired surrogate.
 *  callback_fn:   table of five callbacks; entries 1..4 are
 *                 selected by byte_count (see below), entry
 *                 0 is invoked for a null terminator found
 *                 within bounds. defaults to callbacks that
 *                 only advance the source pointer.
 *  callback_args: passed to every callback; after each call
 *                 its src member determines where scanning
 *                 resumes.
 *
 *  byte_count follows the utf-8 ranges (<= 0x7F: 1,
 *  <= 0x7FF: 2, rest of the 16-bit range: 3, surrogate
 *  pair: 4), so it appears to be the code point's utf-8
 *  length rather than its utf-16 length.
 *
 *  returns NT_STATUS_ILLEGAL_CHARACTER upon an unpaired
 *  surrogate, the first non-zero status returned by a
 *  callback, or otherwise the status of the terminator
 *  callback / NT_STATUS_SUCCESS.
**/
int32_t __stdcall __ntapi_uc_validate_unicode_stream_utf16(
	__in	const wchar16_t *		wch,
	__in	size_t				size_in_bytes	__optional,
	__out	size_t *			code_points	__optional,
	__out	const wchar16_t **		addr_failed	__optional,
	__in	ntapi_uc_utf16_callback_fn **	callback_fn	__optional,
	__in	nt_utf16_callback_args *	callback_args	__optional)
{
	int32_t		  status;
	int		  i;
	const wchar16_t * wch_trail;
	const wchar16_t * wch_boundary;
	unsigned char	  byte_count;
	size_t		  _code_points;

	ntapi_uc_utf16_callback_fn *	_callback_fn[5];
	nt_utf16_callback_args		_callback_args;

	/* discarded counter, used when the caller passes none; */
	/* initialized so that it is never read indeterminate   */
	_code_points = 0;

	if (!callback_fn) {
		for (i = 0; i < 5; i++)
			_callback_fn[i] = __default_callback_fn_utf16;

		callback_fn = _callback_fn;
	}

	if (!callback_args) {
		callback_args = &_callback_args;
		callback_args->src = (wchar16_t *)0;
	}

	/* a preset source pointer takes precedence over wch */
	if (callback_args->src)
		wch = callback_args->src;
	else
		callback_args->src = wch;

	/* no size given: use the highest address as the boundary */
	if (size_in_bytes)
		wch_boundary = (const wchar16_t *)((uintptr_t)wch + size_in_bytes);
	else
		wch_boundary = (const wchar16_t *)(~(uintptr_t)0);

	if (!code_points)
		code_points = &_code_points;

	while ((wch < wch_boundary) && (*wch)) {
		byte_count = 0;

		/* one byte */
		if (*wch <= 0x7F)
			byte_count = 1;

		/* two bytes */
		else if (*wch <= 0x7FF)
			byte_count = 2;

		/* three bytes: any remaining non-surrogate unit */
		else if ((*wch < 0xD800) || (*wch >= 0xE000))
			byte_count = 3;

		/* four bytes: lead surrogate followed, within */
		/* bounds, by a trail surrogate                */
		else if (*wch < 0xDC00) {
			wch_trail = wch + 1;

			if ((wch_trail < wch_boundary)
					&& (*wch_trail >= 0xDC00)
					&& (*wch_trail < 0xE000))
				byte_count = 4;
		}

		/* unpaired lead surrogate, or stray trail surrogate */
		if (!byte_count) {
			if (addr_failed)
				*addr_failed = wch;

			return NT_STATUS_ILLEGAL_CHARACTER;
		}

		(*code_points)++;
		callback_args->byte_count = byte_count;

		if ((status = callback_fn[byte_count](callback_args)))
			return status;

		/* advance, transcode as needed */
		wch = callback_args->src;
	}

	/* null terminator within bounds: hand it to callback zero */
	if ((wch < wch_boundary) && (*wch == 0)) {
		callback_args->byte_count = 1;
		return callback_fn[0](callback_args);
	}

	return NT_STATUS_SUCCESS;
}