MultiByteToWideChar、WideCharToMultiByteとBOMの関係

意外と記載がないのでメモ。

Windows APIで文字コード変換をする定番のAPIとして、MultiByteToWideCharとWideCharToMultiByteがあります(「ANSI」とUTF-16の変換だけなら、ATL/MFCのCW2AやCA2Wが手軽ですが)。

しかし、これらのAPIは、公式ドキュメントにもBOMの扱いが書いてありません。

msdn.microsoft.com

msdn.microsoft.com

どうなるか分からないと扱えないので、試してみました。

環境はWindows 10(64bit)上のVisual Studio 2017のC++、コンソールアプリケーションです。

結論としては、変換元にBOMがある場合は、変換後もBOMつきとなります。変換元にBOMがなければ、変換後にもつきません。BOMはUTF-16とUTF-8の間で正しく変換されます(UTF-16のU+FEFF ⇔ UTF-8のEF BB BF)。

#include <Windows.h>
#include <stringapiset.h>
#include <stdio.h>

// Prints a NUL-terminated UTF-16 string to stdout after converting it to the
// system ANSI code page (CP_ACP).
//
// p: NUL-terminated UTF-16 string to print.
//
// Nothing is printed if either conversion call fails.
void printUtf16(const wchar_t* p) {
    // With a zero-sized output buffer the API only reports the required size
    // in bytes; because the input length is -1 this includes the final NUL.
    int len = WideCharToMultiByte(CP_ACP, 0, p, -1, (LPSTR)nullptr, 0, NULL, NULL);
    if (len <= 0) {
        return;
    }

    char* pAnsi = new char[len];
    if (WideCharToMultiByte(CP_ACP, 0, p, -1, pAnsi, len, NULL, NULL) > 0) {
        // Emit the text verbatim. The converted text must not be used as a
        // printf format string: any '%' in it would be interpreted.
        fputs(pAnsi, stdout);
    }
    delete[] pAnsi;
}

// Experiment: converts the text "あいう" between UTF-16 and UTF-8 with
// WideCharToMultiByte / MultiByteToWideChar, once without and once with a
// leading BOM, and prints the required buffer size and the first code units
// of each result so the handling of the BOM can be observed.
int main()
{
    printUtf16(L"----- 開始 -----\r\n");

    // UTF-16 -> UTF-8 "あいう"
    wchar_t s16a[] = { (wchar_t)0x3042, (wchar_t)0x3044, (wchar_t)0x3046, (wchar_t)0 };
    // Size query: returns the number of bytes needed, including the NUL.
    int len16a = WideCharToMultiByte(CP_UTF8, 0, s16a, -1, (LPSTR)0, 0, NULL, NULL);
    printf("len16a=%d\r\n", len16a);
    if (len16a > 0) {
        char* s8from16a = new char[len16a];
        // Only dump the buffer if the conversion actually filled it.
        if (WideCharToMultiByte(CP_UTF8, 0, s16a, -1, s8from16a, len16a, NULL, NULL) > 0) {
            // Three 3-byte characters plus NUL: at least 6 bytes are available.
            printf("16->8 first bytes : %02x %02x %02x %02x %02x %02x\r\n",
                (unsigned char)(s8from16a[0]), (unsigned char)(s8from16a[1]), (unsigned char)(s8from16a[2]),
                (unsigned char)(s8from16a[3]), (unsigned char)(s8from16a[4]), (unsigned char)(s8from16a[5]));
        }
        delete[] s8from16a;
    }

    // UTF-16 (includes BOM) -> UTF-8 "あいう"
    wchar_t s16b[] = { (wchar_t)0xfeff, (wchar_t)0x3042, (wchar_t)0x3044, (wchar_t)0x3046, (wchar_t)0 };
    int len16b = WideCharToMultiByte(CP_UTF8, 0, s16b, -1, (LPSTR)0, 0, NULL, NULL);
    printf("len16b=%d\r\n", len16b);
    if (len16b > 0) {
        char* s8from16b = new char[len16b];
        if (WideCharToMultiByte(CP_UTF8, 0, s16b, -1, s8from16b, len16b, NULL, NULL) > 0) {
            printf("16->8 first bytes : %02x %02x %02x %02x %02x %02x\r\n",
                (unsigned char)(s8from16b[0]), (unsigned char)(s8from16b[1]), (unsigned char)(s8from16b[2]),
                (unsigned char)(s8from16b[3]), (unsigned char)(s8from16b[4]), (unsigned char)(s8from16b[5]));
        }
        delete[] s8from16b;
    }

    // UTF8 -> UTF16
    char s8a[] = {
        (char)0xE3, (char)0x81, (char)0x82,
        (char)0xE3, (char)0x81, (char)0x84,
        (char)0xE3, (char)0x81, (char)0x86,
        (char)0x00 };
    // Size query: returns the number of wchar_t units needed, including the NUL.
    int len8a = MultiByteToWideChar(CP_UTF8, 0, s8a, -1, (LPWSTR)0, 0);
    printf("len8a=%d\r\n", len8a);
    if (len8a > 0) {
        wchar_t* s16from8a = new wchar_t[len8a];
        if (MultiByteToWideChar(CP_UTF8, 0, s8a, -1, s16from8a, len8a) > 0) {
            // %x expects unsigned int, so convert each code unit explicitly.
            printf("8->16 first bytes : %04x %04x %04x\r\n",
                (unsigned int)s16from8a[0], (unsigned int)s16from8a[1], (unsigned int)s16from8a[2]);
        }
        delete[] s16from8a;
    }

    // UTF8 (with BOM) -> UTF16
    char s8b[] = {
        (char)0xEF, (char)0xBB, (char)0xBF,
        (char)0xE3, (char)0x81, (char)0x82,
        (char)0xE3, (char)0x81, (char)0x84,
        (char)0xE3, (char)0x81, (char)0x86,
        (char)0x00 };
    int len8b = MultiByteToWideChar(CP_UTF8, 0, s8b, -1, (LPWSTR)0, 0);
    printf("len8b=%d\r\n", len8b);
    if (len8b > 0) {
        wchar_t* s16from8b = new wchar_t[len8b];
        if (MultiByteToWideChar(CP_UTF8, 0, s8b, -1, s16from8b, len8b) > 0) {
            printf("8->16 first bytes : %04x %04x %04x\r\n",
                (unsigned int)s16from8b[0], (unsigned int)s16from8b[1], (unsigned int)s16from8b[2]);
        }
        delete[] s16from8b;
    }

    printUtf16(L"----- 終了 -----\r\n");
    return 0;
}

/*
----- 開始 -----
len16a=10
16->8 first bytes : e3 81 82 e3 81 84
len16b=13
16->8 first bytes : ef bb bf e3 81 82
len8a=4
8->16 first bytes : 3042 3044 3046
len8b=5
8->16 first bytes : feff 3042 3044
----- 終了 -----
*/