summaryrefslogtreecommitdiff
path: root/tools/4/source/src/pkg2zip_aes_x86.c
blob: de3a05a1a5bc7c6ee94345e5dfb6b76b6687dcb8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#include "pkg2zip_aes.h"

#include <string.h>
#include <wmmintrin.h> // AESNI
#include <tmmintrin.h> // SSSE3
#include "pkg2zip_aes_x86.h"

// One step of the AES-128 key schedule.
//
//   ctx  - __m128i* slot that receives the CURRENT round key; written with an
//          aligned store, so it must be 16-byte aligned.
//   x    - __m128i lvalue holding the current round key; on exit it holds the
//          NEXT round key. It is evaluated several times, so it must be a
//          plain variable without side effects.
//   rcon - round constant; must be a compile-time constant because it is the
//          immediate operand of _mm_aeskeygenassist_si128.
//
// Wrapped in do/while(0) so the macro behaves as a single statement (safe in
// an unbraced if/else). The temporaries use long names so they cannot capture
// a caller variable passed as `x` or used inside `ctx`.
#define AES128_INIT(ctx, x, rcon)                                                   \
    do {                                                                            \
        __m128i aes128_init_assist_, aes128_init_shift_;                            \
        _mm_store_si128((ctx), (x));                                                \
        aes128_init_assist_ = _mm_aeskeygenassist_si128((x), (rcon));               \
        aes128_init_assist_ = _mm_shuffle_epi32(aes128_init_assist_, 0xff);         \
        aes128_init_shift_ = _mm_slli_si128((x), 4);                                \
        (x) = _mm_xor_si128((x), aes128_init_shift_);                               \
        aes128_init_shift_ = _mm_slli_si128(aes128_init_shift_, 4);                 \
        (x) = _mm_xor_si128((x), aes128_init_shift_);                               \
        aes128_init_shift_ = _mm_slli_si128(aes128_init_shift_, 4);                 \
        (x) = _mm_xor_si128((x), aes128_init_shift_);                               \
        (x) = _mm_xor_si128((x), aes128_init_assist_);                              \
    } while (0)

/*void region_xor_sse(unsigned char* dst, unsigned char* src, int block_size)
{
  __m128i* dst_ptr = (__m128i*)dst;

   __m128i xmm1 = _mm_load_si128((__m128i*)src);
   __m128i xmm2 = _mm_load_si128(dst_ptr);

    xmm2 = _mm_xor_si128(xmm1, xmm2);
    _mm_store_si128(dst_ptr, xmm2);
}*/

// XOR `len` bytes of `src` into `dst` in place: dst[i] ^= src[i].
//
// Preconditions (promised to the compiler, not checked at run time):
//   - `dst` and `src` are both 16-byte aligned (__builtin_assume_aligned);
//   - the two regions do not overlap (restrict).
//
// A zero or negative `len` is a no-op. The index is signed to match `len`:
// with an unsigned index a negative `len` would convert to a huge bound and
// the loop would run far past both buffers.
void region_xor_sse(unsigned char* dst, unsigned char* src, int len)
{
    if (len <= 0)
    {
        return;
    }

    const unsigned char * restrict in = __builtin_assume_aligned(src, 16);
    unsigned char * restrict out = __builtin_assume_aligned(dst, 16);

    for (int i = 0; i < len; ++i)
    {
        out[i] ^= in[i];
    }
}

// Byte-wise XOR of two buffers: dest[i] = src1[i] ^ src2[i] for the first
// `size` bytes, processed front to back. Nothing is written when `size` is
// zero or negative.
void xor1_sse(unsigned char *dest, unsigned char *src1, unsigned char *src2, int size)
{
	unsigned char *out = dest;
	unsigned char *lhs = src1;
	unsigned char *rhs = src2;
	int remaining;

	for (remaining = size; remaining > 0; --remaining)
	{
		*out++ = *lhs++ ^ *rhs++;
	}
}

// Expand a 16-byte AES-128 key into the 11 round keys stored in ctx->key.
//
//   ctx - receives round keys 0..10; they are written with aligned stores, so
//         ctx->key must be 16-byte aligned.
//   key - 16 bytes of key material; read with an unaligned load.
//
// Each AES128_INIT call stores the current round key into the given slot and
// advances `round_key` to the next one using the given round constant. The
// constants must be literals because they become instruction immediates.
void aes128_init_x86(aes128_key* ctx, const uint8_t* key)
{
    __m128i* const schedule = (__m128i*)ctx->key;
    __m128i round_key = _mm_loadu_si128((const __m128i*)key);

    AES128_INIT(&schedule[0], round_key, 0x01);
    AES128_INIT(&schedule[1], round_key, 0x02);
    AES128_INIT(&schedule[2], round_key, 0x04);
    AES128_INIT(&schedule[3], round_key, 0x08);
    AES128_INIT(&schedule[4], round_key, 0x10);
    AES128_INIT(&schedule[5], round_key, 0x20);
    AES128_INIT(&schedule[6], round_key, 0x40);
    AES128_INIT(&schedule[7], round_key, 0x80);
    AES128_INIT(&schedule[8], round_key, 0x1b);
    AES128_INIT(&schedule[9], round_key, 0x36);

    // The last round key is produced by the final macro step but not stored by it.
    _mm_store_si128(&schedule[10], round_key);
}

// Build the decryption key schedule for `key` into ctx->key.
//
// The encryption schedule is expanded into a local context first. The
// decryption schedule is that schedule in reverse order, with the nine inner
// round keys passed through _mm_aesimc_si128; the first and last round keys
// are copied unchanged. All loads and stores are aligned.
void aes128_init_dec_x86(aes128_key* ctx, const uint8_t* key)
{
    aes128_key forward;
    aes128_init_x86(&forward, key);

    const __m128i* enc_keys = (__m128i*)&forward.key;
    __m128i* dec_keys = (__m128i*)&ctx->key;

    // Outer round keys swap places without transformation.
    _mm_store_si128(&dec_keys[10], _mm_load_si128(&enc_keys[0]));
    _mm_store_si128(&dec_keys[0], _mm_load_si128(&enc_keys[10]));

    // Inner round keys: decryption slot d takes the transformed encryption key 10 - d.
    for (size_t d = 9; d >= 1; d--)
    {
        __m128i inner = _mm_load_si128(&enc_keys[10 - d]);
        _mm_store_si128(&dec_keys[d], _mm_aesimc_si128(inner));
    }
}

// Encrypt one 16-byte block with AES-128.
//
//   input - the block to encrypt.
//   key   - 11 consecutive round keys, 16-byte aligned (read with aligned loads).
//
// Returns the encrypted block: initial XOR with round key 0, nine
// _mm_aesenc_si128 rounds with keys 1..9, and a final _mm_aesenclast_si128
// with key 10.
static __m128i aes128_encrypt_x86(__m128i input, const __m128i* key)
{
    __m128i state = _mm_xor_si128(input, _mm_load_si128(&key[0]));

    for (int round = 1; round <= 9; ++round)
    {
        state = _mm_aesenc_si128(state, _mm_load_si128(&key[round]));
    }

    return _mm_aesenclast_si128(state, _mm_load_si128(&key[10]));
}

// Decrypt one 16-byte block with AES-128.
//
//   input - the block to decrypt.
//   key   - 11 consecutive round keys of a decryption schedule (the layout
//           aes128_init_dec_x86 produces), 16-byte aligned.
//
// Returns the decrypted block: initial XOR with round key 0, nine
// _mm_aesdec_si128 rounds with keys 1..9, and a final _mm_aesdeclast_si128
// with key 10.
static __m128i aes128_decrypt_x86(__m128i input, const __m128i* key)
{
    __m128i state = _mm_xor_si128(input, _mm_load_si128(&key[0]));

    for (int round = 1; round <= 9; ++round)
    {
        state = _mm_aesdec_si128(state, _mm_load_si128(&key[round]));
    }

    return _mm_aesdeclast_si128(state, _mm_load_si128(&key[10]));
}

// Encrypt a single 16-byte block.
//
//   ctx    - context whose key holds an encryption schedule.
//   input  - 16 bytes to encrypt; may be unaligned.
//   output - receives 16 bytes of result; may be unaligned.
void aes128_ecb_encrypt_x86(const aes128_key* ctx, const uint8_t* input, uint8_t* output)
{
    __m128i plain = _mm_loadu_si128((const __m128i*)input);
    __m128i cipher = aes128_encrypt_x86(plain, (const __m128i*)ctx->key);

    _mm_storeu_si128((__m128i*)output, cipher);
}

// Decrypt a single 16-byte block.
//
//   ctx    - context whose key holds a decryption schedule
//            (see aes128_init_dec_x86).
//   input  - 16 bytes to decrypt; may be unaligned.
//   output - receives 16 bytes of result; may be unaligned.
void aes128_ecb_decrypt_x86(const aes128_key* ctx, const uint8_t* input, uint8_t* output)
{
    __m128i cipher = _mm_loadu_si128((const __m128i*)input);
    __m128i plain = aes128_decrypt_x86(cipher, (const __m128i*)ctx->key);

    _mm_storeu_si128((__m128i*)output, plain);
}

// Add one to a 16-byte counter held in big-endian byte order and return it.
//
// The bytes are reversed so the counter can be handled as little-endian
// integer lanes, incremented, then reversed back. _mm_add_epi64 works on two
// independent 64-bit lanes, so a carry out of the low lane (counter bytes
// 8..15 all 0xFF) must be moved into the high lane by hand; without that the
// upper 8 counter bytes would never change and the counter would repeat.
static __m128i ctr_increment(__m128i counter)
{
    const __m128i swap = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

    __m128i tmp = _mm_shuffle_epi8(counter, swap);
    tmp = _mm_add_epi64(tmp, _mm_set_epi32(0, 0, 0, 1));

    // After adding 1, the low lane wrapped exactly when it is now zero.
    // Compare per 32-bit lane (SSE2 only), then AND each lane with its
    // neighbour so lanes 0 and 1 both say "the whole low 64 bits are zero".
    __m128i is_zero = _mm_cmpeq_epi32(tmp, _mm_setzero_si128());
    __m128i low_wrapped = _mm_and_si128(is_zero, _mm_shuffle_epi32(is_zero, 0xB1));

    // Move that mask into the high 64-bit lane (low lane becomes 0).
    // Subtracting an all-ones lane (-1) adds the carry.
    __m128i carry = _mm_slli_si128(low_wrapped, 8);
    tmp = _mm_sub_epi64(tmp, carry);

    return _mm_shuffle_epi8(tmp, swap);
}

// XOR `buffer` in place with an AES-128 counter-mode keystream.
//
//   ctx    - context whose key holds an encryption schedule.
//   iv     - 16-byte initial counter block; may be unaligned.
//   buffer - data to transform in place; may be unaligned.
//   size   - number of bytes in `buffer`; need not be a multiple of 16.
//
// The counter block is encrypted to produce 16 keystream bytes, which are
// XORed into the data; the counter is then advanced with ctr_increment.
void aes128_ctr_xor_x86(const aes128_key* ctx, const uint8_t* iv, uint8_t* buffer, size_t size)
{
    const __m128i* round_keys = (__m128i*)ctx->key;
    __m128i ctr = _mm_loadu_si128((const __m128i*)iv);

    uint8_t* cursor = buffer;
    size_t remaining = size;

    // Whole 16-byte blocks.
    for (; remaining >= 16; remaining -= 16, cursor += 16)
    {
        __m128i keystream = aes128_encrypt_x86(ctr, round_keys);
        __m128i data = _mm_loadu_si128((const __m128i*)cursor);
        _mm_storeu_si128((__m128i*)cursor, _mm_xor_si128(data, keystream));

        ctr = ctr_increment(ctr);
    }

    if (remaining == 0)
    {
        return;
    }

    // Trailing partial block: go through a zero-padded scratch block so no
    // more than `remaining` bytes of the caller's buffer are read or written.
    uint8_t scratch[16];
    memcpy(scratch, cursor, remaining);
    memset(scratch + remaining, 0, 16 - remaining);

    __m128i keystream = aes128_encrypt_x86(ctr, round_keys);
    __m128i data = _mm_loadu_si128((const __m128i*)scratch);
    _mm_storeu_si128((__m128i*)scratch, _mm_xor_si128(data, keystream));

    memcpy(cursor, scratch, remaining);
}

// Fold `buffer` into the running 16-byte chaining value in `block`.
//
//   ctx    - context whose key holds an encryption schedule.
//   block  - 16-byte chaining value, read on entry and overwritten on exit;
//            may be unaligned.
//   buffer - input data, consumed 16 bytes at a time; may be unaligned.
//   size   - number of input bytes. Every step reads a full 16 bytes, so this
//            is presumably expected to be a multiple of 16 - confirm against
//            the callers.
//
// For each 16-byte chunk: value = AES-encrypt(value XOR chunk).
void aes128_cmac_process_x86(const aes128_key* ctx, uint8_t* block, const uint8_t* buffer, uint32_t size)
{
    const __m128i* round_keys = (__m128i*)ctx->key;
    const uint8_t* cursor = buffer;

    __m128i mac = _mm_loadu_si128((__m128i*)block);

    uint32_t consumed = 0;
    while (consumed < size)
    {
        __m128i chunk = _mm_loadu_si128((const __m128i*)cursor);
        mac = aes128_encrypt_x86(_mm_xor_si128(mac, chunk), round_keys);

        cursor += 16;
        consumed += 16;
    }

    _mm_storeu_si128((__m128i*)block, mac);
}

// Transform `buffer` in place, 16 bytes at a time, using a counter block that
// is decrypted with AES and chained with the previous counter value.
//
//   ctx    - context whose key holds a decryption schedule
//            (see aes128_init_dec_x86); the blocks go through aes128_decrypt_x86.
//   prev   - 16-byte initial chaining value.
//   block  - 16-byte initial counter block.
//   buffer - data to transform in place.
//   size   - number of bytes in `buffer`. Every step reads and writes a full
//            16 bytes, so this is presumably expected to be a multiple of 16 -
//            confirm against the callers.
//
// Per 16-byte chunk:
//   1. add 1 to the last 32-bit lane of the counter (bytes 12..15, handled as
//      a little-endian 32-bit integer);
//   2. decrypt the counter block;
//   3. XOR the result with the data chunk and with the chaining value;
//   4. the chaining value becomes the current (already incremented) counter.
//
// `prev` and `block` are plain byte pointers with no alignment guarantee, so
// they are read with unaligned loads like every other byte-pointer input in
// this file. An aligned load would fault on a misaligned caller buffer.
void aes128_psp_decrypt_x86(const aes128_key* ctx, const uint8_t* prev, const uint8_t* block, uint8_t* buffer, uint32_t size)
{
    const __m128i* key = (const __m128i*)ctx->key;
    const __m128i one = _mm_setr_epi32(0, 0, 0, 1);

    __m128i x = _mm_loadu_si128((const __m128i*)prev);
    __m128i y = _mm_loadu_si128((const __m128i*)block);

    __m128i* data = (__m128i*)buffer;

    for (uint32_t i = 0; i < size; i += 16)
    {
        y = _mm_add_epi32(y, one);

        __m128i out = aes128_decrypt_x86(y, key);

        out = _mm_xor_si128(out, _mm_loadu_si128(data));
        out = _mm_xor_si128(out, x);
        _mm_storeu_si128(data++, out);

        // Chain on the counter value itself, not on the data.
        x = y;
    }
}