6

RC4流密码

 2 years ago
source link: https://taardisaa.github.io/2022/03/10/RC4/
Go to the source link to view the article. You can view the picture content, updated content and better typesetting reading experience. If the link is broken, please click the button below to view the snapshot at that time.
neoserver,ios ssh client

复习一下。顺便配合最近稍微看过的SSE指令来看看优化后的代码是啥样的。

RC4流密码

BYTE S[256];
BYTE T[256];
BYTE key[256] = {0}; // 最大界限值,key实际长度可为1-256字节

初始化S盒

S中元素的值按升序被置为0-255

初始化T盒

同时建立一个临时向量T。

将密钥的值循环复制到T向量中。

通常和S盒的初始化在一个For循环中。

合在一起就是:

for i in range(256):
S[i] = i
T[i] = key[i % len(key)]

用T产生S的初始置换

j = 0;
for i in range(256):
j = (j + S[i] + T[i]) % 256
S[i], S[j] = S[j], S[i]
  1. 索引j使用S盒,T盒来更新自身
  2. S盒内部交换

具体啥原理我就不理解了,反正这种S盒替换属于比较非线性的操作了。

生成密钥流

i = 0;
j = 0;
for r in range(len(Data)):
i = (i+1) % 256
j = (j+S[i]) % 256
S[i], S[j] = S[j], S[i]
t = (S[i]+S[j]) % 256
Data[r] ^= S[t]
# print(S[t]) 输出密钥流

Data应该就是输入的明文了。这里直接与明文异或,实现了加密。

由于是对称流密码,所以加解密流程一样。相应的输入的数据要换成密文即可。

//程序开始
#include<stdio.h>
#include<string.h>
typedef unsigned long ULONG;

/*
 * RC4 key-scheduling algorithm (KSA).
 *
 * s   - output buffer of at least 256 bytes; receives the keyed permutation.
 * key - key bytes; any byte value is allowed, including values >= 0x80.
 * Len - number of key bytes; the key is repeated cyclically over 256 slots.
 *
 * If key is a null pointer or Len is 0 there is no key material to mix in,
 * so s is left as the identity permutation 0..255 (the previous code
 * divided by zero in that case).
 */
void rc4_init(unsigned char*s, unsigned char*key, unsigned long Len)
{
    int i = 0, j = 0;
    /*
     * The expanded key must be unsigned: with a plain (possibly signed) char,
     * key bytes >= 0x80 become negative, the running index j can go negative
     * after "% 256", and s[j] is then accessed out of bounds.
     */
    unsigned char k[256] = { 0 };
    unsigned char tmp = 0;

    /* Start from the identity permutation. */
    for (i = 0; i < 256; i++)
    {
        s[i] = (unsigned char)i;
    }

    if (!key || Len == 0)
    {
        return;
    }

    /* Repeat the key cyclically to fill the 256-entry temporary table. */
    for (i = 0; i < 256; i++)
    {
        k[i] = key[i % Len];
    }

    /* Shuffle s, driven by the expanded key. */
    for (i = 0; i < 256; i++)
    {
        j = (j + s[i] + k[i]) % 256;
        tmp = s[i];
        s[i] = s[j]; /* swap s[i] and s[j] */
        s[j] = tmp;
    }
}

/*
 * RC4 keystream generation combined with XOR.
 *
 * s    - 256-byte S-box; it is updated in place as keystream bytes are
 *        produced.
 * Data - buffer that is XORed in place with the keystream.
 * Len  - number of bytes of Data to process.
 *
 * The same routine both encrypts and decrypts, because the operation is a
 * plain XOR with the keystream.
 */
void rc4_crypt(unsigned char*s, unsigned char*Data, unsigned long Len)
{
    unsigned int x = 0;
    unsigned int y = 0;
    unsigned long pos;

    for (pos = 0; pos < Len; ++pos)
    {
        unsigned char sx;
        unsigned char sy;

        /* Advance both indices; & 0xFF keeps them inside the S-box. */
        x = (x + 1u) & 0xFFu;
        sx = s[x];
        y = (y + sx) & 0xFFu;
        sy = s[y];

        /* Swap s[x] and s[y]. */
        s[x] = sy;
        s[y] = sx;

        /* The keystream byte is selected by the sum of the swapped entries. */
        Data[pos] ^= s[(sx + sy) & 0xFF];
    }
}

/*
 * Demo driver: derives an S-box from a fixed key, prints it, encrypts a
 * sample buffer in place, then decrypts it again with a saved copy of the
 * freshly initialised S-box.
 */
int main()
{
    unsigned char s[256] = { 0 }, s2[256] = { 0 }; /* S-box and its backup */
    char key[256] = { "justfortest" };
    char pData[512] = "这是一个用来加密的数据Data";
    unsigned long len = strlen(pData);
    unsigned long key_len = strlen(key);
    int i;

    printf("pData=%s\n", pData);
    /* strlen() returns size_t; passing it to %d is undefined behaviour,
       so print the already converted unsigned long with %lu. */
    printf("key=%s,length=%lu\n\n", key, key_len);
    rc4_init(s, (unsigned char*)key, key_len); /* key schedule done */
    printf("完成对S[i]的初始化,如下:\n\n");
    for (i = 0; i<256; i++)
    {
        printf("%02X", s[i]);
        if (i && (i + 1) % 16 == 0)putchar('\n'); /* 16 bytes per row */
    }
    printf("\n\n");
    /* rc4_crypt modifies the S-box while it runs, so keep a copy of the
       initial state for decryption (re-running rc4_init with the same key
       would work as well). */
    for (i = 0; i<256; i++)
    {
        s2[i] = s[i];
    }
    printf("已经初始化,现在加密:\n\n");
    rc4_crypt(s, (unsigned char*)pData, len); /* encrypt in place */
    /* NOTE(review): the ciphertext is raw binary; printing it with %s stops
       at the first zero byte it happens to contain. */
    printf("pData=%s\n\n", pData);
    printf("已经加密,现在解密:\n\n");
    rc4_crypt(s2, (unsigned char*)pData, len); /* decrypt in place */
    printf("pData=%s\n\n", pData);
    return 0;
}

//程序完

实验发现,即使在msvc Release版本中,循环也没有进行任何优化。

尝试了一下写了一个SSE的demo。

#include <nmmintrin.h>
#include <Windows.h>
/*
 * SSE demo: fills the S-box with the identity permutation 0..255 using
 * unaligned 16-byte stores.
 *
 * sBox - destination buffer; exactly 256 bytes are written.
 * Key  - unused for now; key mixing is not implemented in this demo.
 * Len  - unused for now.
 */
void rc4_init2(
    PBYTE sBox,
    PBYTE Key,
    ULONG Len) {
    __m128i* psBox = (__m128i*)sBox;

    /* a holds the next 16 consecutive byte values; b is the per-store step. */
    __m128i a = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i b = _mm_setr_epi8(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16);

    (void)Key;
    (void)Len;

    /*
     * Each store writes 16 bytes, so 256 / 16 = 16 stores cover the S-box.
     * The previous bound "256 >> 3" evaluated to 32 and wrote 512 bytes,
     * overrunning a 256-byte buffer.
     */
    for (int i = 0; i < (256 >> 4); ++i) {
        _mm_storeu_si128(psBox, a);
        a = _mm_add_epi8(a, b);
        ++psBox;
    }
}

暂时只写了一个很简单的初始化S盒,因为后面的话感觉优化挺难的,还是逐字节操作可能比较灵活。(所以这玩意速度真的快嘛?)

实际编译后会直接展开for循环。

v5[0] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002290);
v5[1] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022A0);
v5[2] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022B0);
v5[3] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022C0);
v5[4] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022D0);
v5[5] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022E0);
v5[6] = (__int128)_mm_load_si128((const __m128i *)&xmmword_1400022F0);
v5[7] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002300);
v5[8] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002310);
v5[9] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002320);
v5[10] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002330);
v5[11] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002340);
v5[12] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002350);
v5[13] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002360);
v5[14] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002370);
v5[15] = (__int128)_mm_load_si128((const __m128i *)&xmmword_140002380);

从而增加了不少空间,但显然也加快了少许执行速度。

关于数据结构

  1. 明文长度 == 密钥流长度 == 密文长度
  2. S盒大小:256字节
  3. T盒大小:256字节
  4. 密钥K:(注意区别于密钥流)通常是16字节,可以是1-256字节。

关于算法特征

  1. 排列S盒与生成密钥流时皆有交换
  2. for循环跑256次
  3. 没有硬编码常量,故无法直接FindCrypt

已经不太安全了,有针对破解算法。

https://zh.wikipedia.org/wiki/RC4

Intel 向量指令集完全版

https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#


About Joyk


Aggregate valuable and interesting links.
Joyk means Joy of geeK