iconv 文件编码转换
Linux shell 配置文件中默认的字符集编码为UTF-8。UTF-8是Unicode的一种表达(编码实现)方式,而GB2312和Unicode都是字符的编码方式,所以说GB2312跟UTF-8的概念应该不是一个层次上的。在Linux上进行编码转换时,可以利用iconv命令实现,这是针对文件的,即将指定文件从一种编码转换为另一种编码。
查了下iconv命令用法如下:
用法: iconv [选项...] [文件...]
有如下选项可用:
输入/输出格式规范:
-f, --from-code=名称 原始文本编码
-t, --to-code=名称 输出编码
信息:
-l, --list 列举所有已知的字符集
输出控制:
-c 从输出中忽略无效的字符
-o, --output=FILE 输出文件
-s, --silent 关闭警告
--verbose 打印进度信息
iconv -f utf-8 -t gb2312 /server_test/reports/software_.txt > /server_test/reports/software_asserts.txt
iconv函数族的头文件是iconv.h,使用前需包含之。
#include <iconv.h>
iconv函数族有三个函数,原型如下:
(1) iconv_t iconv_open(const char *tocode, const char *fromcode);
此函数说明将要进行哪两种编码的转换,tocode是目标编码,fromcode是原编码,该函数返回一个转换句柄,供以下两个函数使用。
(2) size_t iconv(iconv_t cd,char **inbuf,size_t *inbytesleft,char **outbuf,size_t *outbytesleft);
此函数从inbuf中读取字符,转换后输出到outbuf中,inbytesleft用以记录还未转换的字节数,outbytesleft用以记录输出缓冲的剩余空间(字节数)。
(3) int iconv_close(iconv_t cd);
此函数用于关闭转换句柄,释放资源。
例子1: 用C语言实现的转换示例程序
/* f.c : 代码转换示例C程序 */
#include <iconv.h>
#define OUTLEN 255
/* Conversion helpers defined later in this file; declared here so that
 * main() does not rely on implicit function declarations. */
int u2g(char *inbuf, int inlen, char *outbuf, int outlen);
int g2u(char *inbuf, size_t inlen, char *outbuf, size_t outlen);

/*
 * Demo entry point: converts the sample text "正在安装" from UTF-8 to
 * GB2312 and back, printing each result.
 *
 * Returns 0 when both conversions succeed, 1 otherwise.
 */
int main(void)
{
    /* The sample text is spelled out as explicit byte sequences so that the
     * program behaves the same no matter which encoding this source file is
     * saved in. */
    char in_utf8[] = "\xe6\xad\xa3\xe5\x9c\xa8\xe5\xae\x89\xe8\xa3\x85";   /* UTF-8 bytes  */
    char in_gb2312[] = "\xd5\xfd\xd4\xda\xb0\xb2\xd7\xb0";                 /* GB2312 bytes */
    char out[OUTLEN];
    int rc;

    /* UTF-8 -> GB2312 */
    rc = u2g(in_utf8, strlen(in_utf8), out, OUTLEN);
    if (rc != 0) {
        printf("unicode-->gb2312 conversion failed\n");
        return 1;
    }
    printf("unicode-->gb2312 out=%s\n", out);

    /* GB2312 -> UTF-8 */
    rc = g2u(in_gb2312, strlen(in_gb2312), out, OUTLEN);
    if (rc != 0) {
        printf("gb2312-->unicode conversion failed\n");
        return 1;
    }
    printf("gb2312-->unicode out=%s\n", out);

    return 0;
}
/*
 * Convert a byte string from one character encoding to another.
 *
 * from_charset  name of the source encoding (as understood by iconv_open)
 * to_charset    name of the target encoding
 * inbuf         input bytes
 * inlen         number of input bytes
 * outbuf        caller-provided output buffer; it is zero-filled first
 * outlen        size of outbuf in bytes; the last byte is reserved so the
 *               result is always NUL-terminated
 *
 * Returns 0 on success and -1 on any failure (bad arguments, unsupported
 * encoding pair, invalid/incomplete input, or output buffer too small).
 */
int code_convert(char *from_charset,char *to_charset,char *inbuf,int inlen,char *outbuf,int outlen)
{
    iconv_t cd;
    char *in_cursor = inbuf;
    char *out_cursor = outbuf;
    size_t in_left;
    size_t out_left;
    int rc = 0;

    if (inbuf == NULL || outbuf == NULL || inlen < 0 || outlen <= 0) {
        return -1;
    }
    memset(outbuf, 0, (size_t)outlen);

    /* iconv_open() reports failure as (iconv_t)-1, not as 0. */
    cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1) {
        return -1;
    }

    /* iconv() wants size_t counters; passing the address of an int would
     * make it read and write the wrong number of bytes on LP64 systems. */
    in_left = (size_t)inlen;
    out_left = (size_t)outlen - 1;

    if (iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left) == (size_t)-1) {
        rc = -1;
    } else if (iconv(cd, NULL, NULL, &out_cursor, &out_left) == (size_t)-1) {
        /* Flushing the final shift sequence (stateful encodings) failed. */
        rc = -1;
    }

    /* Release the descriptor on every path, including failures. */
    iconv_close(cd);
    return rc;
}
/*
 * Convert inlen bytes of UTF-8 text at inbuf to GB2312, storing the result
 * in outbuf (capacity outlen bytes). Returns whatever code_convert returns.
 */
int u2g(char *inbuf,int inlen,char *outbuf,int outlen)
{
    char *source_charset = "utf-8";
    char *target_charset = "gb2312";
    int status;

    status = code_convert(source_charset, target_charset,
                          inbuf, inlen, outbuf, outlen);
    return status;
}
/*
 * Convert inlen bytes of GB2312 text at inbuf to UTF-8, storing the result
 * in outbuf (capacity outlen bytes). Returns whatever code_convert returns.
 */
int g2u(char *inbuf,size_t inlen,char *outbuf,size_t outlen)
{
    char *source_charset = "gb2312";
    char *target_charset = "utf-8";
    int status;

    status = code_convert(source_charset, target_charset,
                          inbuf, inlen, outbuf, outlen);
    return status;
}
例子2: 用C++语言实现的转换示例程序
/* f.cpp : 代码转换示例C++程序 */
#include <iconv.h>
#include <iostream>
#define OUTLEN 255
using namespace std;
// Small wrapper around an iconv conversion descriptor.
//
// The descriptor is opened in the constructor and closed in the destructor.
// NOTE(review): the class keeps its implicit copy operations so that existing
// callers using copy-initialisation still compile; copying an instance would
// make two objects close the same descriptor, so avoid real copies.
class CodeConverter {
private:
    iconv_t cd;   // (iconv_t)-1 when iconv_open() failed
public:
    // Opens a converter from from_charset to to_charset. Failure is not
    // reported here; convert() returns -1 if the descriptor is invalid.
    CodeConverter(const char *from_charset,const char *to_charset) {
        cd = iconv_open(to_charset,from_charset);
    }
    // Closes the descriptor, but only if it was opened successfully.
    ~CodeConverter() {
        if (cd != (iconv_t)-1) {
            iconv_close(cd);
        }
    }
    // Converts inlen bytes at inbuf into outbuf (capacity outlen bytes).
    // outbuf is zero-filled first and its last byte is reserved, so the
    // result is always NUL-terminated.
    // Returns -1 on failure, otherwise the value reported by iconv()
    // (the number of non-reversible conversions, normally 0).
    int convert(char *inbuf,int inlen,char *outbuf,int outlen) {
        if (cd == (iconv_t)-1 || inbuf == NULL || outbuf == NULL ||
            inlen < 0 || outlen <= 0) {
            return -1;
        }
        memset(outbuf,0,outlen);

        // iconv() needs real size_t counters; casting the address of an int
        // to size_t* reads and writes past the int on 64-bit platforms.
        char *in_cursor = inbuf;
        char *out_cursor = outbuf;
        size_t in_left = static_cast<size_t>(inlen);
        size_t out_left = static_cast<size_t>(outlen) - 1;

        // Return to the initial shift state so the object can be reused.
        iconv(cd,NULL,NULL,NULL,NULL);

        size_t rc = iconv(cd,&in_cursor,&in_left,&out_cursor,&out_left);
        if (rc == (size_t)-1) {
            return -1;
        }
        // Emit any pending shift sequence for stateful target encodings.
        if (iconv(cd,NULL,NULL,&out_cursor,&out_left) == (size_t)-1) {
            return -1;
        }
        return static_cast<int>(rc);
    }
};
int main(int argc, char **argv)
{
char *in_utf8 = "姝e?ㄥ??瑁?";
char *in_gb2312 = "正在安装";
char out[OUTLEN];
// utf-8-->gb2312
CodeConverter cc = CodeConverter("utf-8","gb2312");
cc.convert(in_utf8,strlen(in_utf8),out,OUTLEN);
cout << "utf-8-->gb2312 in=" << in_utf8 << ",out=" << out << endl;
// gb2312-->utf-8
CodeConverter cc2 = CodeConverter("gb2312","utf-8");
cc2.convert(in_gb2312,strlen(in_gb2312),out,OUTLEN);
cout << "gb2312-->utf-8 in=" << in_gb2312 << ",out=" << out << endl;
}
iconv支持的编码可用 iconv -l 命令列出,详见文末"附:关于iconv的能力"一节。
http://worldant.blog.sohu.com/96069463.html
在LINUX上进行编码转换时,既可以利用iconv函数族编程实现,也可以利用iconv命令来实现,只不过后者是针对文件的,即将指定文件从一种编码转换为另一种编码。
(1) 利用iconv函数族进行编码转换
iconv函数族的头文件是iconv.h,使用前需包含之。
#include <iconv.h>
iconv函数族有三个函数,原型如下:
- iconv_t iconv_open(const char *tocode, const char *fromcode);
- size_t iconv(iconv_t cd,char **inbuf,size_t *inbytesleft,char **outbuf,size_t *outbytesleft);
- int iconv_close(iconv_t cd);
(2) 利用iconv命令进行编码转换
iconv命令用于转换指定文件的编码,默认输出到标准输出设备,亦可指定输出文件。
用法: iconv [选项...] [文件...]
有如下选项可用:
输入/输出格式规范:
-f, --from-code=名称 原始文本编码
-t, --to-code=名称 输出编码
信息:
-l, --list 列举所有已知的字符集
输出控制:
-c 从输出中忽略无效的字符
-o, --output=FILE 输出文件
-s, --silent 关闭警告
--verbose 打印进度信息
-?, --help 给出该系统求助列表
--usage 给出简要的用法信息
-V, --version 打印程序版本号
例子:
iconv -f utf-8 -t gb2312 aaa.txt >bbb.txt
这个命令读取aaa.txt文件,从utf-8编码转换为gb2312编码,其输出定向到bbb.txt文件。
2.iconv实现通用语言编码转换(c++)
可以实现对任意的两个iconv支持的语言编码做互相转换,比如GB2312, GBK, GB18030, UTF-8, UTF-16, BIG5等.
下面这段程序,非常的稳定,测试了超过10万行的数十种编码的文本的转换都没有出问题。
#include <errno.h>
#include <iconv.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef ICONV_CONST
# define ICONV_CONST const
#endif
/*!
 * Convert a string from one character encoding to another.
 *
 * param from     source encoding name, e.g. "GB2312" (any name iconv accepts)
 * param to       target encoding name
 * param save     caller-allocated buffer that receives the converted data;
 *                the result is always NUL-terminated
 * param savelen  size of save in bytes (one byte is reserved for the NUL)
 * param src      input string to convert
 * param srclen   length of src in bytes
 *
 * Bytes that are invalid in the source encoding are skipped; an incomplete
 * multibyte sequence at the end of the input stops the conversion and the
 * data converted so far is kept.
 *
 * return  number of bytes written to save (excluding the terminating NUL)
 *         on success, or a negative status:
 *           -1  empty input (srclen == 0)
 *           -2  invalid arguments or unsupported encoding pair
 *           -5  output buffer too small
 *           -6  any other iconv failure
 */
int
convert(const char *from, const char *to, char *save, int savelen, char *src, int srclen)
{
    iconv_t cd;
    char *inptr = src;
    char *outptr = save;
    size_t insize;
    size_t outsize;
    int status = 0;

    if (from == NULL || to == NULL || save == NULL || src == NULL ||
        savelen <= 0 || srclen < 0) {
        return -2;
    }
    save[0] = '\0';
    if (srclen == 0) {
        return -1;
    }

    /* iconv_open() signals failure with (iconv_t)-1; using such a descriptor
     * afterwards would be undefined. */
    cd = iconv_open(to, from);
    if (cd == (iconv_t)-1) {
        return -2;
    }

    insize = (size_t)srclen;
    outsize = (size_t)savelen - 1;   /* keep one byte for the terminator */

    while (insize > 0) {
        size_t res = iconv(cd, &inptr, &insize, &outptr, &outsize);

        if (res == (size_t)-1) {
            if (errno == EILSEQ) {
                /* Drop the offending byte and retry. This is portable (no
                 * libiconv-only iconvctl) and always makes progress. */
                inptr++;
                insize--;
            } else if (errno == EINVAL) {
                /* Truncated multibyte sequence at the end of the input. */
                break;
            } else if (errno == E2BIG) {
                status = -5;
                goto done;
            } else {
                status = -6;
                goto done;
            }
        }
    }

    /* Emit any pending shift sequence for stateful target encodings. */
    if (iconv(cd, NULL, NULL, &outptr, &outsize) == (size_t)-1) {
        status = (errno == E2BIG) ? -5 : -6;
        goto done;
    }

    /* iconv wrote straight into save, so no extra copy is needed. */
    status = (int)(outptr - save);

done:
    /* outptr never moves past save + savelen - 1, so this is in bounds. */
    *outptr = '\0';
    iconv_close(cd);
    return status;
}
附:关于iconv的能力
It provides support for the encodings:
- European languages
- ASCII, ISO-8859-{1,2,3,4,5,7,9,10,13,14,15,16}, KOI8-R, KOI8-U, KOI8-RU, CP{1250,1251,1252,1253,1254,1257}, CP{850,866}, Mac{Roman,CentralEurope,Iceland,Croatian,Romania}, Mac{Cyrillic,Ukraine,Greek,Turkish}, Macintosh
- Semitic languages
- ISO-8859-{6,8}, CP{1255,1256}, CP862, Mac{Hebrew,Arabic}
- Japanese
- EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP, ISO-2022-JP-2, ISO-2022-JP-1
- Chinese
- EUC-CN, HZ, GBK, CP936, GB18030, EUC-TW, BIG5, CP950, BIG5-HKSCS, BIG5-HKSCS:2001, BIG5-HKSCS:1999, ISO-2022-CN, ISO-2022-CN-EXT
- Korean
- EUC-KR, CP949, ISO-2022-KR, JOHAB
- Armenian
- ARMSCII-8
- Georgian
- Georgian-Academy, Georgian-PS
- Tajik
- KOI8-T
- Kazakh
- PT154, RK1048
- Thai
- ISO-8859-11, TIS-620, CP874, MacThai
- Laotian
- MuleLao-1, CP1133
- Vietnamese
- VISCII, TCVN, CP1258
- Platform specifics
- HP-ROMAN8, NEXTSTEP
- Full Unicode
- UTF-8, UCS-2, UCS-2BE, UCS-2LE, UCS-4, UCS-4BE, UCS-4LE, UTF-16, UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, UTF-32LE, UTF-7, C99, JAVA
- Full Unicode, in terms of uint16_t or uint32_t (with machine dependent endianness and alignment)
- UCS-2-INTERNAL, UCS-4-INTERNAL
- Locale dependent, in terms of `char' or `wchar_t' (with machine dependent endianness and alignment, and with OS and locale dependent semantics)
- char, wchar_t
The empty encoding name "" is equivalent to "char": it denotes the locale dependent character encoding.
When configured with the option --enable-extra-encodings, it also provides support for a few extra encodings:
- European languages
- CP{437,737,775,852,853,855,857,858,860,861,863,865,869,1125}
- Semitic languages
- CP864
- Japanese
- EUC-JISX0213, Shift_JISX0213, ISO-2022-JP-3
- Chinese
- BIG5-2003 (experimental)
- Turkmen
- TDS565
- Platform specifics
- ATARIST, RISCOS-LATIN1
It has also some limited support for transliteration, i.e. when a character cannot be represented in the target character set, it can be approximated through one or several similarly looking characters. Transliteration is activated when "//TRANSLIT" is appended to the target encoding name.
声明:
[参考]iconv
http://www.gnu.org/software/libiconv/documentation/libiconv/iconv.1.html
http://www.gnu.org/software/libiconv/
[参考] iconv实现通用语言编码转换
http://www.yuanma.org/data/2008/0503/article_3025.htm
[参考]linux下字符集编码转换轻松实现
http://blog.csdn.net/hnhbdss/archive/2007/11/30/1909456.aspx