all repos — mgba @ dab12cf5c674542cae0db7708c333035255fbc65

mGBA Game Boy Advance Emulator

src/third-party/lzma/AesOpt.c (view raw)

  1/* AesOpt.c -- Intel's AES
  22013-11-12 : Igor Pavlov : Public domain */
  3
  4#include "Precomp.h"
  5
  6#include "CpuArch.h"
  7
  8#ifdef MY_CPU_X86_OR_AMD64
  9#if _MSC_VER >= 1500
 10#define USE_INTEL_AES
 11#endif
 12#endif
 13
 14#ifdef USE_INTEL_AES
 15
 16#include <wmmintrin.h>
 17
 18void MY_FAST_CALL AesCbc_Encode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
 19{
 20  __m128i m = *p;
 21  for (; numBlocks != 0; numBlocks--, data++)
 22  {
 23    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
 24    const __m128i *w = p + 3;
 25    m = _mm_xor_si128(m, *data);
 26    m = _mm_xor_si128(m, p[2]);
 27    do
 28    {
 29      m = _mm_aesenc_si128(m, w[0]);
 30      m = _mm_aesenc_si128(m, w[1]);
 31      w += 2;
 32    }
 33    while (--numRounds2 != 0);
 34    m = _mm_aesenc_si128(m, w[0]);
 35    m = _mm_aesenclast_si128(m, w[1]);
 36    *data = m;
 37  }
 38  *p = m;
 39}
 40
 /* Number of blocks the multi-block loops process per iteration.
    AES_OP_W below is hard-wired to exactly this many states (m0, m1, m2). */
 #define NUM_WAYS 3

 /*
  * Applies the round operation `op` with round key w[n] to each of the three
  * states. The macro captures `w`, `m0`, `m1` and `m2` from the caller's
  * scope, so those names must exist at every call site.
  * It expands to a plain brace block because the call sites in this file
  * invoke it without a trailing semicolon.
  */
 #define AES_OP_W(op, n) { \
     const __m128i rk = w[n]; \
     m0 = op(m0, rk); \
     m1 = op(m1, rk); \
     m2 = op(m2, rk); \
     }

 /* Three-state wrappers for the individual AES-NI round instructions. */
 #define AES_DEC(n) AES_OP_W(_mm_aesdec_si128, n)
 #define AES_DEC_LAST(n) AES_OP_W(_mm_aesdeclast_si128, n)
 #define AES_ENC(n) AES_OP_W(_mm_aesenc_si128, n)
 #define AES_ENC_LAST(n) AES_OP_W(_mm_aesenclast_si128, n)
 54
 55void MY_FAST_CALL AesCbc_Decode_Intel(__m128i *p, __m128i *data, size_t numBlocks)
 56{
 57  __m128i iv = *p;
 58  for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
 59  {
 60    UInt32 numRounds2 = *(const UInt32 *)(p + 1);
 61    const __m128i *w = p + numRounds2 * 2;
 62    __m128i m0, m1, m2;
 63    {
 64      const __m128i t = w[2];
 65      m0 = _mm_xor_si128(t, data[0]);
 66      m1 = _mm_xor_si128(t, data[1]);
 67      m2 = _mm_xor_si128(t, data[2]);
 68    }
 69    numRounds2--;
 70    do
 71    {
 72      AES_DEC(1)
 73      AES_DEC(0)
 74      w -= 2;
 75    }
 76    while (--numRounds2 != 0);
 77    AES_DEC(1)
 78    AES_DEC_LAST(0)
 79
 80    {
 81      __m128i t;
 82      t = _mm_xor_si128(m0, iv); iv = data[0]; data[0] = t;
 83      t = _mm_xor_si128(m1, iv); iv = data[1]; data[1] = t;
 84      t = _mm_xor_si128(m2, iv); iv = data[2]; data[2] = t;
 85    }
 86  }
 87  for (; numBlocks != 0; numBlocks--, data++)
 88  {
 89    UInt32 numRounds2 = *(const UInt32 *)(p + 1);
 90    const __m128i *w = p + numRounds2 * 2;
 91    __m128i m = _mm_xor_si128(w[2], *data);
 92    numRounds2--;
 93    do
 94    {
 95      m = _mm_aesdec_si128(m, w[1]);
 96      m = _mm_aesdec_si128(m, w[0]);
 97      w -= 2;
 98    }
 99    while (--numRounds2 != 0);
100    m = _mm_aesdec_si128(m, w[1]);
101    m = _mm_aesdeclast_si128(m, w[0]);
102
103    m = _mm_xor_si128(m, iv);
104    iv = *data;
105    *data = m;
106  }
107  *p = iv;
108}
109
110void MY_FAST_CALL AesCtr_Code_Intel(__m128i *p, __m128i *data, size_t numBlocks)
111{
112  __m128i ctr = *p;
113  __m128i one;
114  one.m128i_u64[0] = 1;
115  one.m128i_u64[1] = 0;
116  for (; numBlocks >= NUM_WAYS; numBlocks -= NUM_WAYS, data += NUM_WAYS)
117  {
118    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
119    const __m128i *w = p;
120    __m128i m0, m1, m2;
121    {
122      const __m128i t = w[2];
123      ctr = _mm_add_epi64(ctr, one); m0 = _mm_xor_si128(ctr, t);
124      ctr = _mm_add_epi64(ctr, one); m1 = _mm_xor_si128(ctr, t);
125      ctr = _mm_add_epi64(ctr, one); m2 = _mm_xor_si128(ctr, t);
126    }
127    w += 3;
128    do
129    {
130      AES_ENC(0)
131      AES_ENC(1)
132      w += 2;
133    }
134    while (--numRounds2 != 0);
135    AES_ENC(0)
136    AES_ENC_LAST(1)
137    data[0] = _mm_xor_si128(data[0], m0);
138    data[1] = _mm_xor_si128(data[1], m1);
139    data[2] = _mm_xor_si128(data[2], m2);
140  }
141  for (; numBlocks != 0; numBlocks--, data++)
142  {
143    UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
144    const __m128i *w = p;
145    __m128i m;
146    ctr = _mm_add_epi64(ctr, one);
147    m = _mm_xor_si128(ctr, p[2]);
148    w += 3;
149    do
150    {
151      m = _mm_aesenc_si128(m, w[0]);
152      m = _mm_aesenc_si128(m, w[1]);
153      w += 2;
154    }
155    while (--numRounds2 != 0);
156    m = _mm_aesenc_si128(m, w[0]);
157    m = _mm_aesenclast_si128(m, w[1]);
158    *data = _mm_xor_si128(*data, m);
159  }
160  *p = ctr;
161}
162
163#else
164
165void MY_FAST_CALL AesCbc_Encode(UInt32 *ivAes, Byte *data, size_t numBlocks);
166void MY_FAST_CALL AesCbc_Decode(UInt32 *ivAes, Byte *data, size_t numBlocks);
167void MY_FAST_CALL AesCtr_Code(UInt32 *ivAes, Byte *data, size_t numBlocks);
168
169void MY_FAST_CALL AesCbc_Encode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
170{
171  AesCbc_Encode(p, data, numBlocks);
172}
173
174void MY_FAST_CALL AesCbc_Decode_Intel(UInt32 *p, Byte *data, size_t numBlocks)
175{
176  AesCbc_Decode(p, data, numBlocks);
177}
178
179void MY_FAST_CALL AesCtr_Code_Intel(UInt32 *p, Byte *data, size_t numBlocks)
180{
181  AesCtr_Code(p, data, numBlocks);
182}
183
184#endif