红联Linux门户
Linux帮助

Ubuntu下实现UTF8编码转为Unicode编码 C程序

发布时间:2017-02-11 10:13:23来源:linux网站作者:_那个谁
下面是在Ubuntu下把UTF-8编码的字节串转换为Unicode码值的C程序(每个字符输出两个字节,高字节在前):
unicode.c
 
#include <stdio.h>
#include<string.h>
#include"unicode.h"
/* Scratch buffer shared by the converters in this file: one_utf8_to_unicode()
 * stores the decoded value here with the lowest byte in output[0], and
 * utf8_to_unicode() reads output[0] and output[1] back out of it. */
unsigned char output[4];
/*
 * Count the leading 1 bits of a UTF-8 byte.
 *
 * For a lead byte this is the length of the sequence in bytes (2..6).
 * A plain ASCII byte (top bit clear) yields 0, a continuation byte
 * (10xxxxxx) yields 1, and 0xFE / 0xFF yield 7 / 8.
 */
int get_utf8_size(unsigned char Input)
{
    int leading_ones = 0;
    unsigned int mask;

    /* Walk from the top bit downwards until the first 0 bit is met. */
    for (mask = 0x80u; mask != 0u && (Input & mask) != 0u; mask >>= 1) {
        leading_ones++;
    }
    return leading_ones;
}
/*
 * Decode a single UTF-8 sequence into the shared `output` buffer.
 *
 * input    - points at the first byte of the sequence; `utfbytes` bytes are
 *            read (one byte when utfbytes is 0).
 * utfbytes - the value get_utf8_size() reports for the lead byte:
 *            0 for a single-byte (ASCII) character, 2..6 for a
 *            multi-byte sequence.
 *
 * On success the decoded value is stored in output[0..3], least-significant
 * byte first, and the number of input bytes consumed is returned (1 for the
 * single-byte case).
 *
 * Returns 0 and leaves `output` all zero when utfbytes is neither 0 nor
 * 2..6, or when any trailing byte is not of the form 10xxxxxx.
 */
int one_utf8_to_unicode(unsigned char* input, int utfbytes)
{
    /* Payload bits carried by the lead byte, indexed by sequence length
     * (110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x). */
    static const unsigned char lead_payload_mask[7] = {
        0x00, 0x00, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };
    unsigned long code_point;
    int i;

    memset(output, 0, sizeof output);

    if (utfbytes == 0) {
        /* Single-byte character: the byte is the value. */
        output[0] = *input;
        return 1;
    }
    if (utfbytes < 2 || utfbytes > 6) {
        return 0;
    }

    code_point = (unsigned long)(input[0] & lead_payload_mask[utfbytes]);
    for (i = 1; i < utfbytes; i++) {
        /* Every trailing byte must have its top two bits equal to 10. */
        if ((input[i] & 0xC0) != 0x80) {
            return 0;
        }
        code_point = (code_point << 6) | (unsigned long)(input[i] & 0x3F);
    }

    /* Store only after full validation so a rejected sequence leaves
     * the buffer zeroed. */
    for (i = 0; i < (int)sizeof output; i++) {
        output[i] = (unsigned char)(code_point & 0xFFu);
        code_point >>= 8;
    }
    return utfbytes;
}
/*
 * Convert a UTF-8 byte string into 16-bit units, high byte first.
 *
 * inputs  - UTF-8 data to convert.
 * outputs - receives two bytes per converted character; every input byte
 *           can produce at most one unit, so 2 * inbyte bytes always suffice.
 * inbyte  - number of bytes available in `inputs`.
 *
 * Returns the number of 2-byte units written (0 for a null pointer or a
 * non-positive inbyte).
 *
 * Only the two low bytes of each decoded value are stored, so values above
 * 0xFFFF are truncated. A sequence rejected by one_utf8_to_unicode() is
 * written as the unit 0x0000.
 */
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte)
{
    enum { MAX_SEQ_LEN = 6 };   /* longest sequence the decoder accepts */
    int consumed = 0;           /* bytes of `inputs` processed so far */
    int written = 0;            /* bytes stored in `outputs` so far */
    int units = 0;              /* 2-byte units produced */

    if (!inputs || !outputs) {
        return 0;
    }

    while (consumed < inbyte) {
        int remaining = inbyte - consumed;
        int lead_ones = get_utf8_size(inputs[consumed]);
        int seq_len = (lead_ones == 0) ? 1 : lead_ones;

        if (seq_len > MAX_SEQ_LEN || seq_len > remaining) {
            /* Impossible lead byte (0xFE/0xFF) or a sequence cut off by the
             * end of the input: consume just this byte. Length 1 is always
             * rejected by the decoder, which leaves `output` zeroed, and it
             * guarantees nothing beyond `inbyte` is read. */
            lead_ones = 1;
            seq_len = 1;
        }

        one_utf8_to_unicode(inputs + consumed, lead_ones);

        /* `output` holds the value low byte first; emit it high byte first. */
        outputs[written] = output[1];
        outputs[written + 1] = output[0];
        written += 2;
        units++;

        consumed += seq_len;
    }
    return units;
}
 
unicode.h文件:
 
#ifndef UNICODE_H
#define UNICODE_H

/* Number of leading 1 bits in a UTF-8 byte: 0 for a single-byte (ASCII)
 * character, otherwise the sequence length announced by the lead byte. */
int get_utf8_size(unsigned char Input);

/* Decode one UTF-8 sequence whose get_utf8_size() value is `utfbytes`.
 * Returns the number of bytes consumed (1 for ASCII) or 0 if the sequence
 * is rejected. */
int one_utf8_to_unicode(unsigned char *input,int utfbytes);

/* Convert `inbyte` bytes of UTF-8 from `inputs` into 2-byte units in
 * `outputs`, high byte first. Returns the number of units written. */
int utf8_to_unicode(unsigned char* inputs,unsigned char* outputs,int inbyte);

#endif /* UNICODE_H */
 
main.c文件:
 
#include <stdio.h>
#include<string.h>
#include"unicode.h"
/* Destination buffer handed to utf8_to_unicode(); it is filled with the
 * converted characters, two bytes each. */
unsigned char out[1024];
/*
 * Demo driver: converts a fixed UTF-8 sample with utf8_to_unicode() and
 * prints each resulting unit as four hex digits followed by its 1-based
 * position.
 */
int main(void)
{
    /* UTF-8 bytes             expected unit */
    unsigned char pss[] = {
        0x74,               /* 0074  t */
        0xe7, 0x9f, 0xa5,   /* 77e5 */
        0xe4, 0xb9, 0x8e,   /* 4e4e */
        0x49,               /* 0049  I */
        0x4e,               /* 004e  N */
        0xe6, 0x97, 0xa5,   /* 65e5 */
        0xe6, 0x8a, 0xa5,   /* 62a5 */
        0xE8, 0xBF, 0x85,   /* 8fc5 */
        0xE8, 0x83, 0xBD,   /* 80fd */
        0x58                /* 0058  X */
    };
    int len;
    int k;

    /* Pass the real size of the sample (22 bytes, 10 characters); a
     * hand-counted 18 stops after the eighth character. */
    len = utf8_to_unicode(pss, out, (int)sizeof pss);

    for (k = 0; k < len; k++) {
        printf("%.2x", out[2 * k]);
        printf("%.2x", out[2 * k + 1]);
        printf("    -%d\n", k + 1);
    }
    return 0;
}
 
makefile文件:
 
# Sources compiled and linked into the demo program.
mainFile = main.c  unicode.c
# Name of the executable to build.
object = unicode

# Neither target names a real file.
.PHONY: all clean

all:$(object)

# Rebuild when a source file or the shared header changes.
# Recipe lines must begin with a tab character.
$(object):$(mainFile) unicode.h
	gcc -o $(object) $(mainFile) -lm -pthread -lrt

# -f keeps `make clean` from failing when nothing has been built yet.
clean:
	rm -f $(object)
 
代码完成!
 
在Ubuntu下打开终端,进入上述文件所在的目录,输入make并回车完成编译,生成可执行文件unicode;然后输入./unicode并回车运行。
 
执行结果(这次运行中传给utf8_to_unicode的长度是18,而示例数据共22字节、10个字符,所以只输出了前8个字符):
 
snail@ubuntu:~/桌面/c/utf8-unicode$ make
gcc -o unicode main.c  unicode.c -lm -pthread -lrt
snail@ubuntu:~/桌面/c/utf8-unicode$ ./unicode
0074    -1
77e5    -2
4e4e    -3
0049    -4
004e    -5
65e5    -6
62a5    -7
8fc5    -8
snail@ubuntu:~/桌面/c/utf8-unicode$
 
本文永久更新地址:http://www.linuxdiyf.com/linux/28328.html