Blame src/argv/ntapi_tt_array_utf8.c

dd89bb
/********************************************************/
dd89bb
/*  ntapi: Native API core library                      */
64e606
/*  Copyright (C) 2013--2021  SysDeer Technologies, LLC */
dd89bb
/*  Released under GPLv2 and GPLv3; see COPYING.NTAPI.  */
dd89bb
/********************************************************/
dd89bb
dd89bb
#include <psxtypes/psxtypes.h>
dd89bb
#include <pemagine/pemagine.h>
dd89bb
#include <ntapi/nt_argv.h>
dd89bb
#include <ntapi/ntapi.h>
dd89bb
#include "ntapi_impl.h"
dd89bb
4cd401
/* overlay for two consecutive bytes of a utf-8 sequence: */
/* 'low' is the byte at the lower address (read first),   */
/* 'high' is the byte which immediately follows it.       */
typedef struct ___two_bytes {
	unsigned char	low;	/* byte at offset 0 */
	unsigned char	high;	/* byte at offset 1 */
} __two_bytes;
4cd401
4cd401
4cd401
/* overlay for three consecutive bytes of a utf-8 sequence, */
/* listed in the order in which they appear in memory.      */
typedef struct ___three_bytes {
	unsigned char	low;	/* byte at offset 0 */
	unsigned char	middle;	/* byte at offset 1 */
	unsigned char	high;	/* byte at offset 2 */
} __three_bytes;
4cd401
4cd401
static void __utf8_to_utf16_handler_1byte_or_null_termination(wchar16_t * dst, const unsigned char * ch)
{
	/**********************************************************/
	/* utf-8:   0xxxxxxx                                      */
	/* utf-16:  00000000 0xxxxxxx                             */
	/*                                                        */
	/* the byte value is stored as-is into one utf-16 unit;   */
	/* this also covers the terminating null byte.            */
	/**********************************************************/

	dst[0] = ch[0];
}
4cd401
4cd401
4cd401
static void __utf8_to_utf16_handler_2bytes(wchar16_t * dst, const unsigned char * ch)
{
	/**********************************************************/
	/* utf-8:   110yyyyy 10xxxxxx                             */
	/* utf-16:  00000yyy yyxxxxxx                             */
	/*                                                        */
	/* ch[0] is the lead byte, ch[1] the continuation byte;   */
	/* the result is written to a single utf-16 unit at dst.  */
	/**********************************************************/

	wchar16_t	unit;

	/* yyyyy: toggle the lead byte's marker bits, make room for xxxxxx */
	unit   = (ch[0] ^ 0xC0);
	unit <<= 6;

	/* xxxxxx: toggle the continuation byte's marker bit */
	unit  |= (ch[1] ^ 0x80);

	*dst = unit;
}
4cd401
4cd401
4cd401
static void __utf8_to_utf16_handler_3bytes(wchar16_t * dst, const unsigned char * ch)
{
	/**********************************************************/
	/* utf-8:   1110zzzz 10yyyyyy 10xxxxxx                    */
	/* utf-16:  zzzzyyyy yyxxxxxx                             */
	/*                                                        */
	/* ch[0] is the lead byte, ch[1] and ch[2] are the        */
	/* continuation bytes; the result is written to a single  */
	/* utf-16 unit at dst.                                    */
	/**********************************************************/

	wchar16_t	unit;
	wchar16_t	mid;

	/* zzzz: toggle the lead byte's marker bits, move to the top nibble */
	unit   = (ch[0] ^ 0xE0);
	unit <<= 12;

	/* yyyyyy: first continuation byte, placed above xxxxxx */
	mid    = (ch[1] ^ 0x80);
	mid  <<= 6;
	unit  |= mid;

	/* xxxxxx: second continuation byte */
	unit  |= (ch[2] ^ 0x80);

	*dst = unit;
}
4cd401
4cd401
4cd401
static void __utf8_to_utf16_handler_4bytes(wchar16_t * dst, const unsigned char * ch)
{
	/**********************************************************/
	/* utf-8:   11110uuu  10uuzzzz  10yyyyyy  10xxxxxx        */
	/* utf-16:  110110ww  wwzzzzyy  110111yy  yyxxxxxx        */
	/*                                                        */
	/* wwww is computed as uuuuu - 1; two utf-16 units are    */
	/* written: dst[0] (0xD800-based) and dst[1]              */
	/* (0xDC00-based).                                        */
	/**********************************************************/

	unsigned char	u_upper;	/* 000uuu00 */
	unsigned char	u_lower;	/* 000000uu */
	unsigned char	z_bits;		/* 00zzzz00 */
	unsigned char	y_upper;	/* 000000yy */
	unsigned char	y_lower;	/* yyyy0000 */
	wchar16_t	w_bits;
	wchar16_t	hi_unit;
	wchar16_t	lo_unit;

	/* uuuuu: three bits from the first byte, two from the second */
	u_upper = (unsigned char)((ch[0] ^ 0xF0) << 2);
	u_lower = (unsigned char)((ch[1] ^ 0x80) >> 4);

	/* wwww, positioned right above zzzzyy */
	w_bits   = (u_upper | u_lower) - 1;
	w_bits <<= 6;

	/* zzzz: the narrowing drops the second byte's upper nibble */
	z_bits   = (unsigned char)(ch[1] << 4);
	z_bits >>= 2;

	/* upper two bits of yyyyyy */
	y_upper = (unsigned char)((ch[2] ^ 0x80) >> 4);

	/* 110110ww wwzzzzyy */
	hi_unit  = 0xD800;
	hi_unit |= w_bits;
	hi_unit |= z_bits;
	hi_unit |= y_upper;

	/* lower four bits of yyyyyy: the narrowing drops the rest */
	y_lower = (unsigned char)(ch[2] << 4);

	/* 110111yy yyxxxxxx */
	lo_unit  = y_lower << 2;
	lo_unit |= ch[3] ^ 0x80;
	lo_unit |= 0xDC00;

	/* write */
	dst[0] = hi_unit;
	dst[1] = lo_unit;
}
4cd401
dd89bb
/**
 * copies an argument vector and an environment vector into a
 * single caller-provided block.
 *
 * block layout: a pointer table (argument entries, a null entry,
 * environment entries, a null entry), immediately followed by the
 * null-terminated strings themselves. each table entry holds the
 * address of its string inside the block minus (ptrdiff_t)base.
 *
 * argument entries are emitted in this order: interp (if set),
 * optarg (if set), script if set or else argv[0] (if either is
 * set), then argv[1] and onward. a null argv or envp is treated
 * as an empty vector.
 *
 * argc:    receives the number of argument entries.
 * blklen:  optional; receives the number of bytes required.
 *
 * both *argc and *blklen are stored before the size check, and are
 * therefore valid also when NT_STATUS_BUFFER_TOO_SMALL is returned
 * (buflen smaller than the required size). returns
 * NT_STATUS_SUCCESS otherwise.
**/
int32_t __stdcall __ntapi_tt_array_copy_utf8(
	__out	int *			argc,
	__in	const char **		argv,
	__in	const char **		envp,
	__in	const char *		interp,
	__in	const char *		optarg,
	__in	const char *		script,
	__in	void *			base,
	__out	void *			buffer,
	__in	size_t			buflen,
	__out	size_t *		blklen)
{
	const char *	empty[2] = {0,0};
	const char *	first;
	const char *	src;
	const char **	vec;
	const char **	slot;
	char *		dst;
	ptrdiff_t	delta;
	ptrdiff_t	nslots;
	size_t		total;

	/* a missing vector is replaced with an empty one */
	if (!argv)
		argv = empty;

	if (!envp)
		envp = empty;

	/*******************************************/
	/* pass one: count entries, measure block  */
	/*******************************************/

	nslots = 0;
	total  = 0;

	/* interp */
	if (interp) {
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(interp)
			+ sizeof(char);
		nslots++;
	}

	/* optarg */
	if (optarg) {
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(optarg)
			+ sizeof(char);
		nslots++;
	}

	/* script takes precedence over argv[0] */
	first = script ? script : argv[0];

	if (first) {
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(first)
			+ sizeof(char);
		nslots++;
	}

	/* remaining arguments */
	vec = &argv[1];

	while (*vec) {
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(*vec)
			+ sizeof(char);
		vec++;
	}

	nslots += (vec - &argv[1]);
	*argc   = (int)nslots;

	/* environment */
	vec = envp;

	while (*vec) {
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(*vec)
			+ sizeof(char);
		vec++;
	}

	nslots += (vec - envp);

	/* the two null entries which terminate the tables */
	nslots += 2;
	total  += 2*sizeof(char *);

	/* report the required size (blklen is optional) */
	if (!blklen)
		blklen = &total;

	*blklen = total;

	if (buflen < total)
		return NT_STATUS_BUFFER_TOO_SMALL;

	/*******************************************/
	/* pass two: fill the table, copy strings  */
	/*******************************************/

	slot  = (const char **)buffer;
	dst   = (char *)(slot+nslots);
	delta = (ptrdiff_t)base;

	/* interp */
	if (interp) {
		*slot++ = dst-delta;
		src     = interp;

		while ((*dst++ = *src++) != '\0')
			;
	}

	/* optarg */
	if (optarg) {
		*slot++ = dst-delta;
		src     = optarg;

		while ((*dst++ = *src++) != '\0')
			;
	}

	/* script / argv[0] */
	first = script ? script : argv[0];

	if (first) {
		*slot++ = dst-delta;
		src     = first;

		while ((*dst++ = *src++) != '\0')
			;
	}

	/* remaining arguments, then the terminating null entry */
	for (vec=&argv[1]; *vec; vec++) {
		*slot++ = dst-delta;
		src     = *vec;

		while ((*dst++ = *src++) != '\0')
			;
	}

	*slot++ = 0;

	/* environment, then the terminating null entry */
	for (vec=envp; *vec; vec++) {
		*slot++ = dst-delta;
		src     = *vec;

		while ((*dst++ = *src++) != '\0')
			;
	}

	*slot = 0;

	return NT_STATUS_SUCCESS;
}
dd89bb
4cd401
/* conversion handlers, indexed by the length in bytes of the */
/* utf-8 sequence to convert; index zero is not a valid length */
/* and accordingly has no handler.                             */
static void (*__utf8_to_utf16_handlers[5])(wchar16_t *, const unsigned char *) = {
	0,							/* 0: none    */
	__utf8_to_utf16_handler_1byte_or_null_termination,	/* 1: 1 byte  */
	__utf8_to_utf16_handler_2bytes,				/* 2: 2 bytes */
	__utf8_to_utf16_handler_3bytes,				/* 3: 3 bytes */
	__utf8_to_utf16_handler_4bytes				/* 4: 4 bytes */
};
4cd401
dd89bb
/* returns the length in bytes (1..4) of the well-formed utf-8  */
/* sequence which begins at utf8, or 0 if the sequence is       */
/* ill-formed (overlong forms, encoded surrogates, values above */
/* U+10FFFF, stray or missing continuation bytes). bytes are    */
/* examined in order and examination stops at the first one     */
/* that is out of range, so a terminating null is never passed. */
static uint8_t __utf8_sequence_length(const uint8_t * utf8)
{
	uint8_t	lead;
	uint8_t	len;
	uint8_t	lo;	/* lowest value permitted for the second byte  */
	uint8_t	hi;	/* highest value permitted for the second byte */
	uint8_t	i;

	lead = utf8[0];

	/* one byte */
	if (lead <= 0x7F)
		return 1;

	/* the second byte's permitted range depends on the lead byte */
	if ((lead >= 0xC2) && (lead <= 0xDF)) {
		len = 2; lo = 0x80; hi = 0xBF;

	} else if (lead == 0xE0) {
		len = 3; lo = 0xA0; hi = 0xBF;

	} else if ((lead >= 0xE1) && (lead <= 0xEC)) {
		len = 3; lo = 0x80; hi = 0xBF;

	} else if (lead == 0xED) {
		len = 3; lo = 0x80; hi = 0x9F;

	} else if ((lead >= 0xEE) && (lead <= 0xEF)) {
		len = 3; lo = 0x80; hi = 0xBF;

	} else if (lead == 0xF0) {
		len = 4; lo = 0x90; hi = 0xBF;

	} else if ((lead >= 0xF1) && (lead <= 0xF3)) {
		len = 4; lo = 0x80; hi = 0xBF;

	} else if (lead == 0xF4) {
		len = 4; lo = 0x80; hi = 0x8F;

	} else {
		return 0;
	}

	if ((utf8[1] < lo) || (utf8[1] > hi))
		return 0;

	/* any further bytes are plain continuation bytes */
	for (i=2; i<len; i++)
		if ((utf8[i] < 0x80) || (utf8[i] > 0xBF))
			return 0;

	return len;
}

/**
 * converts a null-terminated array of utf-8 strings to utf-16.
 *
 * arrv:          null-terminated array of entries; the address of
 *                each source string is the entry plus (ptrdiff_t)base.
 *                a null arrv is treated as an empty array.
 * warrv:         receives one entry per source string, namely the
 *                address of the converted string inside buffer minus
 *                (ptrdiff_t)base bytes, followed by a null entry.
 * base:          must be a multiple of sizeof(wchar16_t).
 * buffer:        receives the null-terminated utf-16 strings.
 * buffer_len:    size of buffer, in bytes.
 * bytes_written: receives the number of bytes stored in buffer.
 *
 * returns NT_STATUS_INVALID_PARAMETER_3 if base is misaligned,
 * NT_STATUS_ILLEGAL_CHARACTER upon an ill-formed utf-8 sequence,
 * NT_STATUS_BUFFER_TOO_SMALL if no room is left for a string's
 * null terminator, and NT_STATUS_SUCCESS otherwise.
 *
 * NOTE(review): as before, a string is silently truncated once
 * fewer than three utf-16 units remain ahead of the write position;
 * confirm with the callers whether that should become an error.
**/
int32_t __stdcall __ntapi_tt_array_convert_utf8_to_utf16(
	__in		char **		arrv,
	__out		wchar16_t **	warrv,
	__out		void *		base,
	__out		wchar16_t *	buffer,
	__in		size_t		buffer_len,
	__out		size_t *	bytes_written)
{
	wchar16_t *	ubound;
	wchar16_t *	bufend;
	wchar16_t *	wch;
	ptrdiff_t	diff;
	ptrdiff_t	wdiff;
	size_t		cap;
	char *		ch;
	const uint8_t *	utf8;
	uint8_t		byte_count;

	if ((uintptr_t)base % sizeof(wchar16_t))
		return NT_STATUS_INVALID_PARAMETER_3;

	wch   = buffer;
	diff  = (ptrdiff_t)base;
	wdiff = (ptrdiff_t)base / sizeof(wchar16_t);

	/* ubound keeps three units in reserve: the two units of a    */
	/* surrogate pair that starts right below it, plus the null   */
	/* terminator. it is clamped so that it never precedes buffer. */
	cap    = buffer_len / sizeof(wchar16_t);
	bufend = buffer + cap;
	ubound = (cap > 3) ? bufend - 3 : buffer;

	for (; arrv && *arrv; arrv++,warrv++) {
		*warrv = wch - wdiff;
		ch     = *arrv + diff;

		/* ubound already accounts for null termination, see above */
		while (*ch && (wch < ubound)) {
			utf8       = (const uint8_t *)ch;
			byte_count = __utf8_sequence_length(utf8);

			if (!byte_count)
				return NT_STATUS_ILLEGAL_CHARACTER;

			__utf8_to_utf16_handlers[byte_count](wch,utf8);

			/* a four-byte sequence produces a surrogate pair, */
			/* and thus occupies two utf-16 units, not one.    */
			wch += (byte_count == 4) ? 2 : 1;
			ch  += byte_count;
		}

		/* once the reserve has been used up by earlier strings, */
		/* fail rather than write past the end of the buffer.    */
		if (wch >= bufend)
			return NT_STATUS_BUFFER_TOO_SMALL;

		*wch++ = 0;
	}

	*warrv = 0;
	*bytes_written = sizeof(wchar16_t) * (wch - buffer);

	return NT_STATUS_SUCCESS;
}