简介
Base64编码是将任何类型的数据转换成ASCII码的可见字符,然后接收端再反向解码,得到原始的数据。最早的Base64是用于发送Email内容的。
经过Base64转换之后的数据大小变大了,为原数据的4/3大小。但是方便了传输,比如由于base64的编码中没有<>等特殊字符,可以不用转义扫描,直接放在XML中,放在MIME中,甚至直接不经过转义扫描存进数据库中。由于有这些方便的特性,即使数据量变大,base64编码还是被广泛使用。
编码原理
每个字节8位,每次取出3个字节,也就是3 x 8 = 24 位。然后每次从此24位中取出6位,然后在前端补2位0,组成新的8位,也就是一个字节。这样就将3个字节转换成了4个字节。由于前面两位都是0,所以转换后的每个字节能表示的最大数字为63, 也就是说转换后的每个字节只可能是0-63中的一个数字。
然后根据规范给出的Base64索引表,将0-63 这64个数字转换成"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"中的一个。
当最后取出3个字节不够时,不够的位置补0,并且最后少一个字节时编码的最后加一个“=”,少两个字节时加两个"="
解码原理
解码是编码的反向过程,每次取出4个字节,然后将每个字节的字符转换成原始Base64索引表对应的索引数字,也就是编码时3字节转换成4字节的转换结果。然后使用位操作将每字节前2位去掉,重新转换成3字节。需要注意的是最后对于结尾“=”的处理。
代码实现
/*
 * Base64 alphabet: the character at index v encodes the 6-bit value v
 * (0-25 upper case, 26-51 lower case, 52-61 digits, 62 '+', 63 '/').
 */
static const char Base64[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789"
	"+/";

/* Padding character appended when the last input group is short. */
static const char Pad64 = '=';
/* (From RFC1521 and draft-ietf-dnssec-secext-03.txt)
The following encoding technique is taken from RFC 1521 by Borenstein
and Freed. It is reproduced here in a slightly edited form for
convenience.
A 65-character subset of US-ASCII is used, enabling 6 bits to be
represented per printable character. (The extra 65th character, "=",
is used to signify a special processing function.)
The encoding process represents 24-bit groups of input bits as output
strings of 4 encoded characters. Proceeding from left to right, a
24-bit input group is formed by concatenating 3 8-bit input groups.
These 24 bits are then treated as 4 concatenated 6-bit groups, each
of which is translated into a single digit in the base64 alphabet.
Each 6-bit group is used as an index into an array of 64 printable
characters. The character referenced by the index is placed in the
output string.
Table 1: The Base64 Alphabet
Value Encoding Value Encoding Value Encoding Value Encoding
0 A 17 R 34 i 51 z
1 B 18 S 35 j 52 0
2 C 19 T 36 k 53 1
3 D 20 U 37 l 54 2
4 E 21 V 38 m 55 3
5 F 22 W 39 n 56 4
6 G 23 X 40 o 57 5
7 H 24 Y 41 p 58 6
8 I 25 Z 42 q 59 7
9 J 26 a 43 r 60 8
10 K 27 b 44 s 61 9
11 L 28 c 45 t 62 +
12 M 29 d 46 u 63 /
13 N 30 e 47 v
14 O 31 f 48 w (pad) =
15 P 32 g 49 x
16 Q 33 h 50 y
Special processing is performed if fewer than 24 bits are available
at the end of the data being encoded. A full encoding quantum is
always completed at the end of a quantity. When fewer than 24 input
bits are available in an input group, zero bits are added (on the
right) to form an integral number of 6-bit groups. Padding at the
end of the data is performed using the '=' character.
Since all base64 input is an integral number of octets, only the
-------------------------------------------------
following cases can arise:
(1) the final quantum of encoding input is an integral
multiple of 24 bits; here, the final unit of encoded
output will be an integral multiple of 4 characters
with no "=" padding,
(2) the final quantum of encoding input is exactly 8 bits;
here, the final unit of encoded output will be two
characters followed by two "=" padding characters, or
(3) the final quantum of encoding input is exactly 16 bits;
here, the final unit of encoded output will be three
characters followed by one "=" padding character.
*/
/*
 * Encode srclength bytes starting at src as base64 text in target.
 *
 * targsize is the capacity of target in bytes and must leave room for the
 * terminating NUL. Returns the number of characters written (the NUL is not
 * counted), or -1 when target is too small; in that case target may already
 * hold the groups that did fit.
 */
int b64_ntop(u_char const *src, size_t srclength, char *target, size_t targsize)
{
	size_t written = 0;
	u_int32_t group;

	assert(src != NULL);
	assert(target != NULL);

	/* Every complete 3-byte group turns into four alphabet characters. */
	for (; srclength >= 3; srclength -= 3, src += 3) {
		group = ((u_int32_t)src[0] << 16) |
		    ((u_int32_t)src[1] << 8) |
		    (u_int32_t)src[2];

		if (written + 4 > targsize)
			return (-1);

		target[written++] = Base64[(group >> 18) & 0x3f];
		target[written++] = Base64[(group >> 12) & 0x3f];
		target[written++] = Base64[(group >> 6) & 0x3f];
		target[written++] = Base64[group & 0x3f];
	}

	/* One or two bytes left over: missing bytes count as zero bits. */
	if (srclength != 0) {
		group = (u_int32_t)src[0] << 16;
		if (srclength == 2)
			group |= (u_int32_t)src[1] << 8;

		if (written + 4 > targsize)
			return (-1);

		target[written++] = Base64[(group >> 18) & 0x3f];
		target[written++] = Base64[(group >> 12) & 0x3f];
		/* A single leftover byte has no third character, only padding. */
		if (srclength == 2)
			target[written++] = Base64[(group >> 6) & 0x3f];
		else
			target[written++] = Pad64;
		target[written++] = Pad64;
	}

	/* The terminator needs one more byte. */
	if (written >= targsize)
		return (-1);
	target[written] = '\0';

	return (written);
}
/*
 * Decode the NUL-terminated base64 string src into target.
 *
 * Whitespace is skipped wherever it appears. Alphabet characters are consumed
 * four at a time, each group yielding up to three bytes. Decoding stops at
 * the first '=' pad. The padding must then complete the group ("==" after two
 * data characters, "=" after three) and may be followed only by whitespace.
 * The bits of the last data character that do not belong to a full byte must
 * be zero.
 *
 * targsize is the capacity of target in bytes. A buffer exactly as large as
 * the decoded data is sufficient.
 *
 * Returns the number of bytes stored in target, or -1 on error: a character
 * outside the alphabet, malformed padding, input that ends inside a group,
 * or a target that is too small.
 */
int b64_pton(char const *src, u_char *target, size_t targsize)
{
	size_t tarindex;
	int state, ch;
	u_char nextbyte;
	char *pos;

	assert(src != NULL);
	assert(target != NULL);

	state = 0;
	tarindex = 0;

	/* Cast through u_char so ch is never negative when given to isspace(). */
	while ((ch = (u_char) *src++) != '\0') {
		if (isspace(ch))	/* Skip whitespace anywhere. */
			continue;

		if (ch == Pad64)
			break;

		pos = strchr(Base64, ch);
		if (pos == NULL)	/* A non-base64 character. */
			return (-1);

		/* state is the position (0-3) of this character in its group. */
		switch (state) {
		case 0:
			if (target) {
				if (tarindex >= targsize)
					return (-1);
				target[tarindex] = (pos - Base64) << 2;
			}
			state = 1;
			break;
		case 1:
			if (target) {
				if (tarindex >= targsize)
					return (-1);
				target[tarindex] |=
				    (u_int32_t)(pos - Base64) >> 4;
				/*
				 * The low bits start the next byte. Only store
				 * them if that byte exists; if it does not,
				 * they must be zero (last character before
				 * padding), otherwise the buffer is too small.
				 */
				nextbyte = ((pos - Base64) & 0x0f) << 4;
				if (tarindex + 1 < targsize)
					target[tarindex + 1] = nextbyte;
				else if (nextbyte != 0)
					return (-1);
			}
			tarindex++;
			state = 2;
			break;
		case 2:
			if (target) {
				if (tarindex >= targsize)
					return (-1);
				target[tarindex] |=
				    (u_int32_t)(pos - Base64) >> 2;
				/* Same rule as in case 1 for the spill-over bits. */
				nextbyte = ((pos - Base64) & 0x03) << 6;
				if (tarindex + 1 < targsize)
					target[tarindex + 1] = nextbyte;
				else if (nextbyte != 0)
					return (-1);
			}
			tarindex++;
			state = 3;
			break;
		case 3:
			if (target) {
				if (tarindex >= targsize)
					return (-1);
				target[tarindex] |= (pos - Base64);
			}
			tarindex++;
			state = 0;
			break;
		default:
			abort();
		}
	}

	/*
	 * We are done decoding Base-64 chars. Let's see if we ended
	 * on a byte boundary, and/or with erroneous trailing characters.
	 */
	if (ch == Pad64) {		/* We got a pad char. */
		ch = (u_char) *src++;	/* Skip it, get next. */
		switch (state) {
		case 0:		/* Invalid = in first position */
		case 1:		/* Invalid = in second position */
			return (-1);

		case 2:		/* Valid, means one byte of info */
			/* Skip any number of spaces. */
			for (; ch != '\0'; ch = (u_char) *src++)
				if (!isspace(ch))
					break;
			/* Make sure there is another trailing = sign. */
			if (ch != Pad64)
				return (-1);
			ch = (u_char) *src++;	/* Skip the = */
			/* Fall through to "single trailing =" case. */
			/* FALLTHROUGH */

		case 3:		/* Valid, means two bytes of info */
			/*
			 * We know this char is an =. Is there anything but
			 * whitespace after it?
			 */
			for (; ch != '\0'; ch = (u_char) *src++)
				if (!isspace(ch))
					return (-1);

			/*
			 * Now make sure for cases 2 and 3 that the "extra"
			 * bits that slopped past the last full byte were
			 * zeros. If we don't check them, they become a
			 * subliminal channel. When the buffer ended exactly
			 * at the last full byte, the check was already made
			 * while decoding and there is no byte to look at.
			 */
			if (target && tarindex < targsize &&
			    target[tarindex] != 0)
				return (-1);
		}
	} else {
		/*
		 * We ended by seeing the end of the string. Make sure we
		 * have no partial bytes lying around.
		 */
		if (state != 0)
			return (-1);
	}

	return (tarindex);
}
测试代码
/*
 * Round-trip demo: fill a 200-byte buffer with 0..199, print it, encode it
 * with b64_ntop(), decode the text again with b64_pton() and print the
 * decoded bytes. Returns 0 on success and 1 if either call reports an error.
 */
int main(void)
{
	unsigned char data[200];
	unsigned char data2[400];
	char enstr[1024];	/* b64_ntop() writes text, so plain char */
	int i;
	int ret;

	printf("src:\n");
	for (i = 0; i < 200; i++)
	{
		data[i] = (unsigned char)i;
	}
	for (i = 0; i < 200; i++)
	{
		printf("%02x,", data[i]);
		if ((i + 1) % 16 == 0)
		{
			printf("\n");
		}
	}

	memset(enstr, 0, sizeof(enstr));
	printf("\nexe:\n");
	/* Pass the real capacities instead of hard-coded numbers. */
	ret = b64_ntop(data, sizeof(data), enstr, sizeof(enstr));
	printf("ret=%d\n%s\n", ret, enstr);
	if (ret < 0)
	{
		return 1;
	}

	ret = b64_pton(enstr, data2, sizeof(data2));
	printf("ret=%d\nresult:\n", ret);
	if (ret < 0)
	{
		return 1;
	}

	/* Print only the bytes that were actually decoded. */
	for (i = 0; i < ret; i++)
	{
		printf("%02x,", data2[i]);
		if ((i + 1) % 16 == 0)
		{
			printf("\n");
		}
	}
	printf("\n");
	return 0;
}
其他
base64在不同的场景也有一个差别,有的编码结果中每76个字符加入一个换行。这也是正确的。以上的代码示例中没有加入,如需加入只需计数,定期加入换行符即可。
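编码结果的大小在不加入换行符时约为原来长度的4/3。但是需要注意精确malloc内存时,由于结尾不足3字节的部分也会被补齐成4个字符,准确的长度是 (org_len + 2) / 3 * 4(整数除法),如果需要增加结尾的'\0',需要再加1字节。也可以直接使用略大的上界 (org_len + 3) * 4 / 3 + 1。注意 org_len * 4 / 3 + 2 是不够的,例如 org_len 为1时,编码结果是4个字符,加上'\0'共需要5字节,而该公式只得到3。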