Don't split a surrogate pair over two output buffers. Bug 600974, r=emk, dbaron, a=blocking (a16956dc) · Commits · Tommy Webb / Tor Browser

intl/uconv/src/nsUTF8ToUnicode.cpp

+35 −4

Original line number	Diff line number	Diff line
		@@ -44,6 +44,15 @@

		#define UNICODE_BYTE_ORDER_MARK 0xFEFF

		// Encodes the supplementary-plane code point |ucs4| as a UTF-16 surrogate
		// pair and stores it at |aDest|.
		//
		// @param ucs4   code point to encode; expected to be above 0xFFFF and no
		//               larger than 0x10FFFF (both checked by assertion only).
		// @param aDest  output position. Two code units are written
		//               unconditionally, so the caller must have verified that
		//               at least two units of space remain.
		// @return       the position just past the two code units written.
		static PRUnichar* EmitSurrogatePair(PRUint32 ucs4, PRUnichar* aDest)
		{
		  NS_ASSERTION(ucs4 > 0xFFFF, "Should be a supplementary character");
		  // The masks below keep only 10 bits per half, so a larger value would
		  // silently wrap around into a different (wrong) surrogate pair.
		  NS_ASSERTION(ucs4 <= 0x10FFFF, "Should not exceed the Unicode code space");

		  // Rebase to a 20-bit offset from the start of the supplementary planes.
		  ucs4 -= 0x00010000;
		  // High surrogate: 0xD800 plus the upper 10 bits of the offset.
		  *aDest++ = static_cast<PRUnichar>(0xD800 | (0x000003FF & (ucs4 >> 10)));
		  // Low surrogate: 0xDC00 plus the lower 10 bits of the offset.
		  *aDest++ = static_cast<PRUnichar>(0xDC00 | (0x000003FF & ucs4));
		  return aDest;
		}

		//----------------------------------------------------------------------
		// Class nsUTF8ToUnicode [implementation]

		@@ -277,6 +286,22 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,

		nsresult res = NS_OK; // conversion result

		out = aDest;
		if (mState == 0xFF) {
		// Emit supplementary character left over from previous iteration. If the
		// buffer size is insufficient, treat it as an illegal character.
		if (aDestLen < 2) {
		NS_ERROR("Output buffer insufficient to hold supplementary character");
		mState = 0;
		return NS_ERROR_ILLEGAL_INPUT;
		}
		out = EmitSurrogatePair(mUcs4, out);
		mUcs4 = 0;
		mState = 0;
		mBytes = 1;
		mFirst = PR_FALSE;
		}

		// alias these locally for speed
		PRInt32 mUcs4 = this->mUcs4;
		PRUint8 mState = this->mState;
		@@ -288,7 +313,7 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
		if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc))))
		mFirst = PR_FALSE;

		for (in = aSrc, out = aDest; ((in < inend) && (out < outend)); ++in) {
		for (in = aSrc; ((in < inend) && (out < outend)); ++in) {
		if (0 == mState) {
		// When mState is zero we expect either a US-ASCII character or a
		// multi-octet sequence.
		@@ -375,9 +400,15 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
		}
		if (mUcs4 > 0xFFFF) {
		// mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair
		mUcs4 -= 0x00010000;
		*out++ = 0xD800 \| (0x000003FF & (mUcs4 >> 10));
		*out++ = 0xDC00 \| (0x000003FF & mUcs4);
		if (out + 2 > outend) {
		// insufficient space left in the buffer. Keep mUcs4 for the
		// next iteration.
		mState = 0xFF;
		++in;
		res = NS_OK_UDEC_MOREOUTPUT;
		break;
		}
		out = EmitSurrogatePair(mUcs4, out);
		} else if (UNICODE_BYTE_ORDER_MARK != mUcs4 \|\| !mFirst) {
		// Don't output the BOM only if it is the first character
		*out++ = mUcs4;