24.8 如何在GB2312与Unicode之间互相转换 Q: 在Windows上有WideCharToMultiByte()、MultiByteToWideChar()等函数可用,Unix 上有类似函数吗。 A: scz 请"man iconv_open",然后动用Google进行相关搜索。之所以没有建议"man iconv", 因为iconv(1)对应着一个命令,Linux上应该看iconv(3),Solaris上应该看iconv(3C), 其它系统各有变化,但"man iconv_open"总是一致的。 我们就iconv*()函数进行了简单测试: -------------------------------------------------------------------------- /* * gcc -DLinux -Wall -pipe -O3 -s -o iconv_test iconv_test.c * gcc -DSparc -Wall -pipe -O3 -s -o iconv_test iconv_test.c */ #if 0 我的一台FreeBSD测试机上居然没有/usr/include/iconv.h 而另一台AIX测试机上则根本不支持GB2312,编译时需指定库 gcc -DAix -Wall -pipe -O3 -s -o iconv_test iconv_test.c -liconv 显然,在各个缺省安装的Unix系统上iconv*()被支持的程度不一,所非自行安装 libiconv,否则这是一个高度不可移植的选择,决定放弃iconv*()。 本程序仅为测试目的存在,没有实用价值。 #endif #include #include #include #include #include static void outputBinary ( FILE *out, unsigned char *byteArray, size_t byteArrayLen ) { size_t offset, k, j, i; fprintf( out, "byteArray [ %u bytes ] ->\n", ( unsigned int )byteArrayLen ); if ( byteArrayLen <= 0 ) { return; } i = 0; offset = 0; for ( k = byteArrayLen / 16; k > 0; k--, offset += 16 ) { fprintf( out, "%08X ", ( unsigned int )offset ); for ( j = 0; j < 16; j++, i++ ) { if ( j == 8 ) { fprintf( out, "-%02X", byteArray[i] ); } else { fprintf( out, " %02X", byteArray[i] ); } } fprintf( out, " " ); i -= 16; for ( j = 0; j < 16; j++, i++ ) { /* * if ( isprint( (int)byteArray[i] ) ) */ #if 1 if ( ( byteArray[i] >= ' ' ) && ( byteArray[i] != 0x7F ) && ( byteArray[i] < 0xFF ) ) #else if ( ( byteArray[i] >= ' ' ) && ( byteArray[i] < 0x7F ) ) #endif { fprintf( out, "%c", byteArray[i] ); } else { fprintf( out, "." ); } } fprintf( out, "\n" ); } /* end of for */ k = byteArrayLen - i; if ( k <= 0 ) { return; } fprintf( out, "%08X ", ( unsigned int )offset ); for ( j = 0 ; j < k; j++, i++ ) { if ( j == 8 ) { fprintf( out, "-%02X", byteArray[i] ); } else { fprintf( out, " %02X", byteArray[i] ); } } i -= k; for ( j = 16 - k; j > 0; j-- ) { fprintf( out, " " ); } fprintf( out, " " ); for ( j = 0; j < k; j++, i++ ) { #if 1 if ( ( byteArray[i] >= ' ' ) && ( byteArray[i] != 0x7F ) && ( byteArray[i] < 0xFF ) ) #else if ( ( byteArray[i] >= ' ' ) && ( byteArray[i] < 0x7F ) ) #endif { fprintf( out, "%c", byteArray[i] ); } else { fprintf( out, "." ); } } fprintf( out, "\n" ); return; } /* end of outputBinary */ #ifdef Linux int main ( int argc, char * argv[] ) { int ret = EXIT_FAILURE; iconv_t cd = ( iconv_t )-1; char in[] = "GB2312与Unicode双向转换测试"; size_t inlen = sizeof( in ); char *inp = in; char out[sizeof(in)*2]; size_t outlen = sizeof( out ); char *outp = out; /* * 大小写不敏感 */ if ( ( iconv_t )-1 == ( cd = iconv_open( "UCS-2", "GB2312" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } outputBinary ( stderr, in, sizeof( in ) ); if ( -1 == iconv( cd, &inp, &inlen, &outp, &outlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, out, sizeof( out ) - outlen ); outlen = sizeof( out ) - outlen; outp = out; inlen = sizeof( in ); inp = in; if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } if ( ( iconv_t )-1 == ( cd = iconv_open( "GB2312", "UCS-2" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } if ( -1 == iconv( cd, &outp, &outlen, &inp, &inlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, in, sizeof( in ) - inlen ); ret = EXIT_SUCCESS; main_exit: if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } return( ret ); } /* end of main */ #if 0 [scz@ /home/scz/src]> ./iconv_test byteArray [ 28 bytes ] -> 00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode 00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00 byteArray [ 42 bytes ] -> 00000000 47 00 42 00 32 00 33 00-31 00 32 00 0E 4E 55 00 G.B.2.3.1.2..NU. 00000010 6E 00 69 00 63 00 6F 00-64 00 65 00 CC 53 11 54 n.i.c.o.d.e. 00000020 6C 8F 62 63 4B 6D D5 8B-00 00 byteArray [ 28 bytes ] -> 00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode 00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00 #endif #elif defined(Sparc) int main ( int argc, char * argv[] ) { int ret = EXIT_FAILURE; iconv_t cd = ( iconv_t )-1; char in[] = "GB2312与Unicode双向转换测试"; size_t inlen = sizeof( in ); char *inp = in; char out[sizeof(in)*2]; size_t outlen = sizeof( out ); char *outp = out; char xxx[sizeof(out)]; size_t xxxlen = sizeof( xxx ); char *xxxp = xxx; /* * 大小写敏感,SPARC/Solaris上无法直接从"gb2312"转到"UCS-2" */ if ( ( iconv_t )-1 == ( cd = iconv_open( "UTF-8", "gb2312" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } outputBinary ( stderr, in, sizeof( in ) ); if ( -1 == iconv( cd, ( const char ** )&inp, &inlen, &outp, &outlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, out, sizeof( out ) - outlen ); outlen = sizeof( out ) - outlen; outp = out; if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } if ( ( iconv_t )-1 == ( cd = iconv_open( "UCS-2", "UTF-8" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } if ( -1 == iconv( cd, ( const char ** )&outp, &outlen, &xxxp, &xxxlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, xxx, sizeof( xxx ) - xxxlen ); xxxlen = sizeof( xxx ) - xxxlen; xxxp = xxx; outlen = sizeof( out ); outp = out; if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } if ( ( iconv_t )-1 == ( cd = iconv_open( "UTF-8", "UCS-2" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } if ( -1 == iconv( cd, ( const char ** )&xxxp, &xxxlen, &outp, &outlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, out, sizeof( out ) - outlen ); outlen = sizeof( out ) - outlen; outp = out; inlen = sizeof( in ); inp = in; if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } if ( ( iconv_t )-1 == ( cd = iconv_open( "gb2312", "UTF-8" ) ) ) { perror( "iconv_open() error" ); goto main_exit; } if ( -1 == iconv( cd, ( const char ** )&outp, &outlen, &inp, &inlen ) ) { perror( "iconv() error" ); goto main_exit; } outputBinary ( stderr, in, sizeof( in ) - inlen ); ret = EXIT_SUCCESS; main_exit: if ( ( iconv_t )-1 != cd ) { iconv_close( cd ); cd = ( iconv_t )-1; } return( ret ); } /* end of main */ #if 0 [scz@ /export/home/scz/src]> ./iconv_test byteArray [ 28 bytes ] -> 00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode. 00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00 byteArray [ 35 bytes ] -> 00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312 00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8 00000020 AF 95 00 byteArray [ 44 bytes ] -> 00000000 FE FF 00 47 00 42 00 32-00 33 00 31 00 32 4E 0E ..G.B.2.3.1.2N. 00000010 00 55 00 6E 00 69 00 63-00 6F 00 64 00 65 53 CC .U.n.i.c.o.d.eS. 00000020 54 11 8F 6C 63 62 6D 4B-8B D5 00 00 T. byteArray [ 35 bytes ] -> 00000000 47 42 32 33 31 32 E4 B8-8E 55 6E 69 63 6F 64 65 GB2312 00000010 E5 8F 8C E5 90 91 E8 BD-AC E6 8D A2 E6 B5 8B E8 00000020 AF 95 00 byteArray [ 28 bytes ] -> 00000000 47 42 32 33 31 32 D3 EB-55 6E 69 63 6F 64 65 CB GB2312与Unicode. 00000010 AB CF F2 D7 AA BB BB B2-E2 CA D4 00 #endif #endif -------------------------------------------------------------------------- 鉴于缺省情况下iconv*()不具有良好的可移植性,可以考虑其它方案。其中一种替代 方案是在.c中预定义相关码表,自行完成查表转换工作。但这种方案要考虑字节序的 问题,参看前面x86/Linux、SPARC/Solaris上的输出信息,也不具有良好可移植性。