在 Ubuntu 下用 C 程序实现 UTF-8 编码到 Unicode 编码(每个字符输出 2 字节,高字节在前)的转换:
unicode.c文件:
#include <stdio.h>
#include<string.h>
#include"unicode.h"
// Result buffer filled by one_utf8_to_unicode(): the decoded code point,
// least significant byte first (output[0] is the low byte).
unsigned char output[4];
// Count the consecutive 1 bits at the top of a byte, starting from bit 7.
//
// For a UTF-8 lead byte this is the length of the sequence in bytes (2..6).
// Note the special values: 0 means a single-byte ASCII character (top bit
// clear), 1 means the byte is a continuation byte (10xxxxxx), and 7 or 8 is
// returned for 0xFE / 0xFF.
int get_utf8_size(unsigned char Input)
{
    int ones = 0;
    int mask;

    // Walk the mask from the top bit downwards; stop at the first 0 bit
    // (or when the mask has been shifted out entirely).
    for (mask = 0x80; (mask & Input) != 0; mask >>= 1)
    {
        ones++;
    }
    return ones;
}
// Decode a single UTF-8 sequence into the file-level buffer `output`.
//
// input    : points at the first byte of the sequence; `utfbytes` bytes are
//            read (one byte when utfbytes is 0).
// utfbytes : the value get_utf8_size() returns for the lead byte:
//            0 for a single-byte ASCII character, 2..6 for a multi-byte
//            sequence.
//
// On success the code point is stored in output[0..3], least significant
// byte first, and the number of input bytes consumed is returned (1 for
// ASCII, otherwise utfbytes).
// On failure (utfbytes is 1, negative or greater than 6, or a continuation
// byte is not of the form 10xxxxxx) `output` is left all zero and 0 is
// returned.
int one_utf8_to_unicode(unsigned char* input, int utfbytes)
{
    // Payload bits carried by the lead byte of an N-byte sequence,
    // indexed by N (entries 0 and 1 are unused).
    static const unsigned char lead_payload_mask[7] = {
        0x00, 0x00, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };
    unsigned long code_point;
    int i;

    // memset comes from <string.h>, which this file already includes.
    memset(output, 0, sizeof output);

    if (utfbytes == 0)
    {
        // Single-byte character: the byte is the code point.
        output[0] = *input;
        return 1;
    }
    if (utfbytes < 2 || utfbytes > 6)
    {
        return 0;
    }

    code_point = (unsigned long)(input[0] & lead_payload_mask[utfbytes]);
    for (i = 1; i < utfbytes; i++)
    {
        // Every trailing byte must look like 10xxxxxx.
        if ((input[i] & 0xC0) != 0x80)
        {
            return 0;
        }
        // Each continuation byte contributes its low 6 bits.
        code_point = (code_point << 6) | (unsigned long)(input[i] & 0x3F);
    }

    // A 6-byte sequence carries at most 31 bits, so four bytes are enough.
    output[0] = (unsigned char)(code_point & 0xFF);
    output[1] = (unsigned char)((code_point >> 8) & 0xFF);
    output[2] = (unsigned char)((code_point >> 16) & 0xFF);
    output[3] = (unsigned char)((code_point >> 24) & 0xFF);
    return utfbytes;
}
// Convert a UTF-8 byte buffer into a sequence of 16-bit code units.
//
// inputs  : UTF-8 encoded bytes.
// outputs : receives two bytes per input character, high byte first. The
//           caller must provide room for 2 bytes per character (at most
//           2 * inbyte bytes). Only the low 16 bits of each code point are
//           stored, so characters above U+FFFF are truncated.
// inbyte  : number of bytes available in `inputs`.
//
// Returns the number of 16-bit units written to `outputs`.
// A sequence that cannot be decoded (stray continuation byte, bad
// continuation byte, 0xFE/0xFF lead byte, or a sequence cut off by the end of
// the buffer) produces the unit 0x0000.
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte)
{
    int consumed = 0;   // bytes of `inputs` processed so far
    int written = 0;    // bytes stored in `outputs` so far
    int units = 0;      // 16-bit units produced

    if (inputs == NULL || outputs == NULL)
    {
        return 0;
    }

    while (consumed < inbyte)
    {
        int remaining = inbyte - consumed;
        int lead_bits = get_utf8_size(inputs[consumed]);
        // get_utf8_size() reports 0 for ASCII, which still occupies one byte.
        int seq_len = (lead_bits == 0) ? 1 : lead_bits;
        int decoded = 0;

        if (lead_bits > 6)
        {
            // 0xFE / 0xFF never start a sequence: skip just this byte.
            seq_len = 1;
        }
        else if (seq_len > remaining)
        {
            // Sequence is cut off by the end of the buffer: do not read
            // beyond it, drop what is left.
            seq_len = remaining;
        }
        else
        {
            decoded = one_utf8_to_unicode(inputs + consumed, lead_bits);
        }

        // `output` holds the code point low byte first; emit high byte first.
        outputs[written] = decoded ? output[1] : 0;
        outputs[written + 1] = decoded ? output[0] : 0;
        written += 2;
        consumed += seq_len;
        units++;
    }
    return units;
}
unicode.h文件:
/* Public interface of unicode.c: UTF-8 to 16-bit code unit conversion. */
#ifndef UNICODE_H
#define UNICODE_H

/* Number of leading 1 bits in Input: 0 for an ASCII byte, 1 for a
 * continuation byte, otherwise the length in bytes of the UTF-8 sequence
 * that starts with Input. */
int get_utf8_size(unsigned char Input);

/* Decode one UTF-8 sequence starting at input; utfbytes is the value
 * get_utf8_size() returned for its first byte. Returns the number of bytes
 * consumed, or 0 if the sequence is invalid. */
int one_utf8_to_unicode(unsigned char *input,int utfbytes);

/* Convert inbyte bytes of UTF-8 from inputs into outputs, two bytes per
 * character with the high byte first. Returns the number of characters
 * written. */
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte);

#endif /* UNICODE_H */
main.c文件:
#include <stdio.h>
#include<string.h>
#include"unicode.h"
// Receives the converted text: two bytes per character, high byte first.
unsigned char out[1024];

// Demo driver: converts a fixed UTF-8 sample and prints each resulting
// 16-bit unit as four hex digits followed by its 1-based index.
int main(void)
{
    int len=0;
    // UTF-8 bytes        // expected 16-bit code unit
    unsigned char pss[]={
        0x74,             //0074
        0xe7,0x9f,0xa5,   //77e5
        0xe4,0xb9,0x8e,   //4e4e
        0x49,             //0049
        0x4e,             //004e N
        0xe6,0x97,0xa5,   //65e5
        0xe6,0x8a,0xa5,   //62a5
        0xE8,0xBF,0x85,   //8FC5
        0xE8,0x83,0xBD,   //80fd
        0x58              //0058 X
    };
    int j=0;
    int k;

    // NOTE(review): pss holds 22 bytes but only the first 18 (the first
    // eight characters) are converted; pass sizeof pss to convert all ten.
    len=utf8_to_unicode(pss,out,18);

    for(k=0;k<len*2;k++)
    {
        printf("%.2x",out[k]);
        // After the second byte of each unit, print the running index.
        if(k%2>0)
        {
            printf(" -%d\n",++j);
        }
    }
    return 0;
}
makefile文件:
# Sources compiled into the demo program.
mainFile = main.c unicode.c
# Name of the executable to build.
object = unicode

# These targets do not name files.
.PHONY: all clean

all: $(object)

# Rebuild when a source file or the shared header changes.
# Recipe lines must start with a TAB character.
$(object): $(mainFile) unicode.h
	gcc -o $(object) $(mainFile) -lm -pthread -lrt

# -f keeps `make clean` from failing when the executable does not exist.
clean:
	rm -f $(object)
代码完成!
ubuntu下运行终端,进入该文件保存目录,输入make回车完成编译,生成unicode可执行文件,输入./unicode回车执行。
执行结果(main.c 中只向 utf8_to_unicode 传入了前 18 个字节,因此只输出前 8 个字符):
snail@ubuntu:~/桌面/c/utf8-unicode$ make
gcc -o unicode main.c unicode.c -lm -pthread -lrt
snail@ubuntu:~/桌面/c/utf8-unicode$ ./unicode
0074 -1
77e5 -2
4e4e -3
0049 -4
004e -5
65e5 -6
62a5 -7
8fc5 -8
snail@ubuntu:~/桌面/c/utf8-unicode$