MultiByteToWideChar、WideCharToMultiByteとBOMの関係
意外と記載がないのでメモ。
Windows APIで文字コード変換をする定番のAPIとして`MultiByteToWideChar`と`WideCharToMultiByte`があります(「ANSI」とUTF-16の変換だけならATL/MFCの`CW2A`や`CA2W`が手軽ですが)。
しかし、これらのAPIは、公式ドキュメントにもBOMの扱いが書いてありません。
どうなるか分からないと扱えないので、試してみました。
環境はWindows 10(64bit)上のVisual Studio 2017のC++、コンソールアプリケーションです。
結論としては、変換元にBOMがある場合は、変換後もBOMつきとなります。変換元にBOMがなければ、変換後にもつきません。BOMはUTF-16(U+FEFF)とUTF-8(EF BB BF)の間で正しく相互変換されます。
#include <Windows.h>
#include <stringapiset.h>
#include <stdio.h>

// Number of leading UTF-8 bytes / UTF-16 code units shown in each dump.
enum { kUtf8BytesShown = 6, kUtf16UnitsShown = 3 };

/**
 * Prints a NUL-terminated UTF-16 string to stdout after converting it to the
 * ANSI code page (CP_ACP).
 *
 * Prints nothing when either conversion call reports failure.
 *
 * @param p NUL-terminated UTF-16 string to print.
 */
void printUtf16(const wchar_t* p)
{
    // A zero-sized destination makes the API return the required byte count.
    int len = WideCharToMultiByte(CP_ACP, 0, p, -1, nullptr, 0, NULL, NULL);
    if (len <= 0) {
        return;
    }

    char* pAnsi = new char[len];
    if (WideCharToMultiByte(CP_ACP, 0, p, -1, pAnsi, len, NULL, NULL) > 0) {
        // Converted text must never be used as a printf format string:
        // a '%' in the data would be interpreted as a conversion specifier.
        fputs(pAnsi, stdout);
    }
    delete[] pAnsi;
}

/**
 * Converts a NUL-terminated UTF-16 string to UTF-8, prints the required
 * buffer size as "<name>=<size>" and then the leading bytes of the result.
 *
 * @param name Label printed in front of the size.
 * @param src  NUL-terminated UTF-16 input.
 */
static void dumpUtf16ToUtf8(const char* name, const wchar_t* src)
{
    int len = WideCharToMultiByte(CP_UTF8, 0, src, -1, nullptr, 0, NULL, NULL);
    printf("%s=%d\r\n", name, len);
    if (len <= 0) {
        return;
    }

    char* utf8 = new char[len];
    int written = WideCharToMultiByte(CP_UTF8, 0, src, -1, utf8, len, NULL, NULL);
    if (written > 0) {
        printf("16->8 first bytes :");
        // Never read past what the conversion actually produced.
        for (int i = 0; i < written && i < kUtf8BytesShown; ++i) {
            printf(" %02x", (unsigned int)(unsigned char)utf8[i]);
        }
        printf("\r\n");
    }
    delete[] utf8;
}

/**
 * Converts a NUL-terminated UTF-8 string to UTF-16, prints the required
 * buffer size as "<name>=<size>" and then the leading code units of the result.
 *
 * @param name Label printed in front of the size.
 * @param src  NUL-terminated UTF-8 input.
 */
static void dumpUtf8ToUtf16(const char* name, const char* src)
{
    int len = MultiByteToWideChar(CP_UTF8, 0, src, -1, nullptr, 0);
    printf("%s=%d\r\n", name, len);
    if (len <= 0) {
        return;
    }

    wchar_t* utf16 = new wchar_t[len];
    int written = MultiByteToWideChar(CP_UTF8, 0, src, -1, utf16, len);
    if (written > 0) {
        printf("8->16 first bytes :");
        // Never read past what the conversion actually produced.
        for (int i = 0; i < written && i < kUtf16UnitsShown; ++i) {
            printf(" %04x", (unsigned int)utf16[i]);
        }
        printf("\r\n");
    }
    delete[] utf16;
}

/**
 * Runs four conversions (UTF-16 <-> UTF-8, each with and without a leading
 * BOM) and prints the converted sizes and leading bytes / code units.
 */
int main()
{
    printUtf16(L"----- 開始 -----\r\n");

    // UTF-16 -> UTF-8 "あいう"
    wchar_t s16a[] = { (wchar_t)0x3042, (wchar_t)0x3044, (wchar_t)0x3046, (wchar_t)0 };
    dumpUtf16ToUtf8("len16a", s16a);

    // UTF-16 (includes BOM) -> UTF-8 "あいう"
    wchar_t s16b[] = { (wchar_t)0xfeff, (wchar_t)0x3042, (wchar_t)0x3044, (wchar_t)0x3046, (wchar_t)0 };
    dumpUtf16ToUtf8("len16b", s16b);

    // UTF8 -> UTF16
    char s8a[] = { (char)0xE3, (char)0x81, (char)0x82,
                   (char)0xE3, (char)0x81, (char)0x84,
                   (char)0xE3, (char)0x81, (char)0x86, (char)0x00 };
    dumpUtf8ToUtf16("len8a", s8a);

    // UTF8 (with BOM) -> UTF16
    char s8b[] = { (char)0xEF, (char)0xBB, (char)0xBF,
                   (char)0xE3, (char)0x81, (char)0x82,
                   (char)0xE3, (char)0x81, (char)0x84,
                   (char)0xE3, (char)0x81, (char)0x86, (char)0x00 };
    dumpUtf8ToUtf16("len8b", s8b);

    printUtf16(L"----- 終了 -----\r\n");
    return 0;
}

/*
----- 開始 -----
len16a=10
16->8 first bytes : e3 81 82 e3 81 84
len16b=13
16->8 first bytes : ef bb bf e3 81 82
len8a=4
8->16 first bytes : 3042 3044 3046
len8b=5
8->16 first bytes : feff 3042 3044
----- 終了 -----
*/