Blob Blame History Raw
/********************************************************/
/*  ntapi: Native API core library                      */
/*  Copyright (C) 2013--2021  SysDeer Technologies, LLC */
/*  Released under GPLv2 and GPLv3; see COPYING.NTAPI.  */
/********************************************************/

#include <psxtypes/psxtypes.h>
#include <pemagine/pemagine.h>
#include <ntapi/nt_argv.h>
#include <ntapi/ntapi.h>
#include "ntapi_impl.h"

/* byte-wise view of two consecutive bytes of a utf-8 sequence; */
/* the handlers below overlay it on the input bytes, so that    */
/* member order equals stream order.                            */
typedef struct ___two_bytes {
	unsigned char	low;	/* byte at the lower address (comes first)  */
	unsigned char	high;	/* byte at the higher address (comes next)  */
} __two_bytes;


/* byte-wise view of a three-byte utf-8 sequence; the three-byte */
/* handler overlays it on the input bytes, so that member order  */
/* equals stream order.                                          */
typedef struct ___three_bytes {
	unsigned char	low;	/* first byte of the sequence  */
	unsigned char	middle;	/* second byte of the sequence */
	unsigned char	high;	/* third byte of the sequence  */
} __three_bytes;

/* single-byte sequence (or the null terminator): the byte value */
/* is stored unmodified as one utf-16 code unit.                 */
static void __utf8_to_utf16_handler_1byte_or_null_termination(wchar16_t * dst, const unsigned char * ch)
{
	/***************************/
	/* in:   0xxxxxxx          */
	/* out:  00000000 0xxxxxxx */
	/***************************/

	dst[0] = ch[0];
}


/* two-byte sequence: strips the 110 and 10 markers from the two */
/* input bytes and packs the remaining 5+6 payload bits into one */
/* utf-16 code unit.                                             */
static void __utf8_to_utf16_handler_2bytes(wchar16_t * dst, const unsigned char * ch)
{
	/***************************/
	/* in:   110yyyyy 10xxxxxx */
	/* out:  00000yyy yyxxxxxx */
	/***************************/

	wchar16_t unit;

	/* yyyyy: lead byte without its 110 marker, moved above xxxxxx */
	unit   = (ch[0] ^ 0xC0);
	unit <<= 6;

	/* xxxxxx: continuation byte without its 10 marker */
	unit  |= (ch[1] ^ 0x80);

	*dst = unit;
}


/* three-byte sequence: strips the 1110 and 10 markers from the    */
/* three input bytes and packs the remaining 4+6+6 payload bits    */
/* into one utf-16 code unit.                                      */
static void __utf8_to_utf16_handler_3bytes(wchar16_t * dst, const unsigned char * ch)
{
	/************************************/
	/* in:   1110zzzz 10yyyyyy 10xxxxxx */
	/* out:  zzzzyyyy yyxxxxxx          */
	/************************************/

	wchar16_t unit;
	wchar16_t mid;

	/* zzzz: lead byte without its 1110 marker, moved to the top nibble */
	unit   = (ch[0] ^ 0xE0);
	unit <<= 12;

	/* yyyyyy: first continuation byte without its 10 marker */
	mid    = (ch[1] ^ 0x80);
	mid  <<= 6;
	unit  |= mid;

	/* xxxxxx: second continuation byte without its 10 marker */
	unit  |= (ch[2] ^ 0x80);

	*dst = unit;
}


/* four-byte sequence: converts the four input bytes to a utf-16  */
/* surrogate pair, and stores the lead surrogate at dst[0] and    */
/* the trail surrogate at dst[1]; wwww in the diagram below is    */
/* the five-bit value uuuuu minus one.                            */
static void __utf8_to_utf16_handler_4bytes(wchar16_t * dst, const unsigned char * ch)
{
	/*************************************************/
	/* in:   11110uuu  10uuzzzz  10yyyyyy  10xxxxxx  */
	/* out:  110110ww  wwzzzzyy  110111yy  yyxxxxxx  */
	/*************************************************/

	unsigned char	uuuuu;
	unsigned char	zzzz;
	unsigned char	yy;
	wchar16_t	wwww;
	wchar16_t	hi_surrogate;
	wchar16_t	lo_surrogate;

	/* uuuuu: three bits from the first byte, two from the second */
	uuuuu  = (unsigned char)((ch[0] ^ 0xF0) << 2);
	uuuuu |= (unsigned char)((ch[1] ^ 0x80) >> 4);

	/* wwww: uuuuu minus one, positioned right below the 110110 tag */
	wwww   = uuuuu - 1;
	wwww <<= 6;

	/* zzzz: low nibble of the second byte, positioned right above yy */
	zzzz   = (unsigned char)((ch[1] & 0x0F) << 2);

	/* yy: the upper two payload bits of the third byte */
	yy     = (unsigned char)(ch[2] ^ 0x80);
	yy   >>= 4;

	/* 110110ww wwzzzzyy */
	hi_surrogate  = 0xD800;
	hi_surrogate |= wwww;
	hi_surrogate |= zzzz;
	hi_surrogate |= yy;

	/* 110111yy yyxxxxxx: the lower four payload bits of the third */
	/* byte, followed by the six payload bits of the fourth byte   */
	lo_surrogate  = (ch[2] & 0x0F) << 6;
	lo_surrogate |= ch[3] ^ 0x80;
	lo_surrogate |= 0xDC00;

	/* write */
	dst[0] = hi_surrogate;
	dst[1] = lo_surrogate;
}

/* copies the null-terminated string src, terminator included, to dst; */
/* returns the address that immediately follows the copied terminator. */
static char * __array_copy_utf8_str(char * dst, const char * src)
{
	while (*src)
		*dst++ = *src++;

	*dst++ = '\0';

	return dst;
}

/*
 * packs an argument vector and an environment vector into buffer.
 *
 * layout: a table of pointers at the beginning of buffer (argument
 * entries, a null entry, environment entries, a null entry), followed
 * by the bytes of the strings. each stored pointer is the address of
 * the copied string minus the numeric value of base.
 *
 * the argument entries are, in this order: interp (if non-null),
 * optarg (if non-null), script if non-null or otherwise argv[0] (if
 * non-null), and then argv[1] and on. a null argv or envp is treated
 * as an empty vector.
 *
 * *argc   receives the number of argument entries.
 * *blklen receives the number of bytes required (blklen may be null).
 *
 * both are set before the size check, hence also when the function
 * returns NT_STATUS_BUFFER_TOO_SMALL because buflen is less than the
 * required size; otherwise NT_STATUS_SUCCESS is returned.
 */
int32_t __stdcall __ntapi_tt_array_copy_utf8(
	__out	int *			argc,
	__in	const char **		argv,
	__in	const char **		envp,
	__in	const char *		interp,
	__in	const char *		optarg,
	__in	const char *		script,
	__in	void *			base,
	__out	void *			buffer,
	__in	size_t			buflen,
	__out	size_t *		blklen)
{
	const char *	empty[2] = {0,0};
	const char *	first;
	const char **	vec;
	const char **	slot;
	char *		str;
	ptrdiff_t	nptrs;
	ptrdiff_t	delta;
	size_t		total;

	/* absent vectors behave like empty ones; the two-entry array */
	/* keeps the read of argv[1] below within bounds.             */
	if (!argv)
		argv = empty;

	if (!envp)
		envp = empty;

	/* script, when present, stands in for argv[0] */
	first = script ? script : argv[0];

	/* first pass: count the pointers and the bytes */
	nptrs = 0;
	total = 0;

	/* interp */
	if (interp) {
		nptrs++;
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(interp)
			+ sizeof(char);
	}

	/* optarg */
	if (optarg) {
		nptrs++;
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(optarg)
			+ sizeof(char);
	}

	/* script / argv[0] */
	if (first) {
		nptrs++;
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(first)
			+ sizeof(char);
	}

	/* argv[1] and on */
	for (vec=argv+1; *vec; vec++)
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(*vec)
			+ sizeof(char);

	nptrs += vec - (argv+1);
	*argc  = (int)nptrs;

	/* envp */
	for (vec=envp; *vec; vec++)
		total += sizeof(char *)
			+ __ntapi->tt_string_null_offset_multibyte(*vec)
			+ sizeof(char);

	nptrs += vec - envp;

	/* the null entries that terminate the two vectors */
	nptrs += 2;
	total += 2*sizeof(char *);

	/* report the required size (optional) */
	if (blklen)
		*blklen = total;

	if (buflen < total)
		return NT_STATUS_BUFFER_TOO_SMALL;

	/* second pass: pointer table first, string bytes right after it */
	slot  = (const char **)buffer;
	str   = (char *)(slot+nptrs);
	delta = (ptrdiff_t)base;

	/* interp */
	if (interp) {
		*slot++ = str-delta;
		str = __array_copy_utf8_str(str,interp);
	}

	/* optarg */
	if (optarg) {
		*slot++ = str-delta;
		str = __array_copy_utf8_str(str,optarg);
	}

	/* script / argv[0] */
	if (first) {
		*slot++ = str-delta;
		str = __array_copy_utf8_str(str,first);
	}

	/* argv[1] and on */
	for (vec=argv+1; *vec; vec++) {
		*slot++ = str-delta;
		str = __array_copy_utf8_str(str,*vec);
	}

	*slot++ = 0;

	/* envp */
	for (vec=envp; *vec; vec++) {
		*slot++ = str-delta;
		str = __array_copy_utf8_str(str,*vec);
	}

	*slot++ = 0;

	return NT_STATUS_SUCCESS;
}

/* conversion handlers, indexed by the length in bytes (1..4) of the */
/* utf-8 sequence to convert; entry 0 is a null placeholder, and     */
/* must accordingly never be called.                                 */
static void (*__utf8_to_utf16_handlers[5])(wchar16_t *, const unsigned char *) = {
	0,
	__utf8_to_utf16_handler_1byte_or_null_termination,
	__utf8_to_utf16_handler_2bytes,
	__utf8_to_utf16_handler_3bytes,
	__utf8_to_utf16_handler_4bytes};

/* returns the length in bytes (1..4) of the well-formed utf-8 sequence */
/* that begins at utf8, or 0 if the bytes there do not form one. a byte */
/* is only examined after all bytes before it were found to be valid,   */
/* so that the scan never proceeds past a null terminator.              */
static uint8_t __utf8_sequence_length(const uint8_t * utf8)
{
	uint8_t	lead;
	uint8_t	count;
	uint8_t	idx;
	uint8_t	lo;
	uint8_t	hi;

	lead = utf8[0];

	/* one byte */
	if (lead <= 0x7F)
		return 1;

	/* valid range of the second byte (narrowed for some lead bytes) */
	lo = 0x80;
	hi = 0xBF;

	if ((lead >= 0xC2) && (lead <= 0xDF)) {
		count = 2;

	} else if ((lead >= 0xE0) && (lead <= 0xEF)) {
		count = 3;

		if (lead == 0xE0) {
			lo = 0xA0;	/* exclude overlong forms   */
		} else if (lead == 0xED) {
			hi = 0x9F;	/* exclude U+D800..U+DFFF   */
		}

	} else if ((lead >= 0xF0) && (lead <= 0xF4)) {
		count = 4;

		if (lead == 0xF0) {
			lo = 0x90;	/* exclude overlong forms   */
		} else if (lead == 0xF4) {
			hi = 0x8F;	/* exclude above U+10FFFF   */
		}

	} else {
		/* 0x80..0xC1 and 0xF5..0xFF never begin a sequence */
		return 0;
	}

	/* second byte */
	if ((utf8[1] < lo) || (utf8[1] > hi))
		return 0;

	/* remaining continuation bytes */
	for (idx=2; idx<count; idx++) {
		if ((utf8[idx] < 0x80) || (utf8[idx] > 0xBF))
			return 0;
	}

	return count;
}

/*
 * converts a null-terminated array of utf-8 strings to utf-16.
 *
 * arrv:          null-terminated array of string pointers; the numeric
 *                value of base is added to each entry before the string
 *                is read. a null arrv is treated as an empty array.
 * warrv:         receives one pointer per converted string, followed by
 *                a null entry; each pointer is the address of the
 *                converted string in buffer minus the value of base.
 * base:          must be a multiple of sizeof(wchar16_t).
 * buffer:        receives the converted, null-terminated strings, which
 *                are followed by one additional null code unit.
 * buffer_len:    size of buffer in bytes.
 * bytes_written: receives the number of bytes stored in buffer; only
 *                set upon success.
 *
 * the last three code units of buffer are held in reserve, so that a
 * surrogate pair and the terminators written at the bound remain within
 * the buffer; a conversion that reaches the reserve is reported as
 * NT_STATUS_BUFFER_TOO_SMALL.
 *
 * returns NT_STATUS_SUCCESS, NT_STATUS_INVALID_PARAMETER_3 (misaligned
 * base), NT_STATUS_ILLEGAL_CHARACTER (ill-formed utf-8 input), or
 * NT_STATUS_BUFFER_TOO_SMALL.
 */
int32_t __stdcall __ntapi_tt_array_convert_utf8_to_utf16(
	__in		char **		arrv,
	__out		wchar16_t **	warrv,
	__out		void *		base,
	__out		wchar16_t *	buffer,
	__in		size_t		buffer_len,
	__out		size_t *	bytes_written)
{
	wchar16_t *	ubound;
	wchar16_t *	wch;
	ptrdiff_t	diff;
	ptrdiff_t	wdiff;
	size_t		wcap;
	char *		ch;
	const uint8_t *	utf8;
	uint8_t		byte_count;

	if ((uintptr_t)base % sizeof(wchar16_t))
		return NT_STATUS_INVALID_PARAMETER_3;

	/* a buffer smaller than the reserve cannot hold any output; */
	/* bail out before a bound that precedes buffer is computed. */
	wcap = buffer_len / sizeof(wchar16_t);

	if (wcap < 3)
		return NT_STATUS_BUFFER_TOO_SMALL;

	wch   = buffer;
	diff  = (ptrdiff_t)base;
	wdiff = (ptrdiff_t)base / sizeof(wchar16_t);

	/* reserve: one unit for the second half of a surrogate pair, */
	/* one for the string terminator, one for the final null.     */
	ubound = buffer + (wcap - 3);

	for (; arrv && *arrv && (wch<ubound); arrv++,warrv++) {
		*warrv = wch - wdiff;
		ch     = *arrv + diff;

		/* ubound already accounts for null termination, see above */
		while (*ch && (wch<ubound)) {
			utf8 = (const uint8_t *)ch;

			if (!(byte_count = __utf8_sequence_length(utf8)))
				return NT_STATUS_ILLEGAL_CHARACTER;

			__utf8_to_utf16_handlers[byte_count](wch,utf8);

			/* four-byte sequences yield a surrogate pair */
			wch += (byte_count == 4) ? 2 : 1;
			ch  += byte_count;
		}

		*wch++ = 0;
	}

	/* wch may overshoot ubound by up to two units (surrogate pair, */
	/* string terminator), and either loop may have stopped at the  */
	/* bound with input still pending; an equality test would thus  */
	/* report truncated output as success.                          */
	if (wch >= ubound)
		return NT_STATUS_BUFFER_TOO_SMALL;

	*wch++ = 0;
	*warrv = 0;
	*bytes_written = sizeof(wchar16_t) * (wch - buffer);

	return NT_STATUS_SUCCESS;
}