crypto/aesgcm-ref-poly.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364

/*
 * Implementation of the GCM polynomial hash in pure software, but
 * based on a primitive that performs 64x64->128 bit polynomial
 * multiplication over GF(2).
 *
 * This implementation is technically correct (should even be
 * side-channel safe as far as I can see), but it's hopelessly slow,
 * so no live SSH connection should ever use it. Therefore, it's
 * deliberately not included in the lists in aesgcm-select.c. For pure
 * software GCM in live use, you want aesgcm-sw.c, and that's what the
 * selection system will choose.
 *
 * However, this implementation _is_ made available to testcrypt, so
 * all the GCM tests in cryptsuite.py are run over this as well as the
 * other implementations.
 *
 * The reason why this code exists at all is to act as a reference for
 * GCM implementations that use a CPU-specific polynomial multiply
 * intrinsic or asm statement. This version will run on whatever
 * platform you're trying to port to, and will generate all the same
 * intermediate results you expect the CPU-specific one to go through.
 * So you can insert parallel diagnostics in this version and in your
 * new version, to see where the two diverge.
 *
 * Also, this version is a good place to put long comments explaining
 * the entire strategy of this implementation and its rationale. That
 * avoids those comments having to be duplicated in the multiple
 * platform-specific implementations, which can focus on commenting
 * the way the local platform's idioms match up to this version, and
 * refer to this file for the explanation of the underlying technique.
 */

#include "ssh.h"
#include "aesgcm.h"

/*
 * Store a 128-bit value in the most convenient form standard C will
 * let us, namely two uint64_t giving its most and least significant
 * halves.
 */
typedef struct {
    uint64_t hi, lo;
} value128_t;

typedef struct aesgcm_ref_poly {
    AESGCM_COMMON_FIELDS;

    /*
     * The state of our GCM implementation is represented entirely by
     * three 128-bit values:
     */

    /*
     * The value at which we're evaluating the polynomial. The GCM
     * spec calls this 'H'. It's defined once at GCM key setup time,
     * by encrypting the all-zeroes value with our block cipher.
     */
    value128_t var;

    /*
     * Accumulator containing the result of evaluating the polynomial
     * so far.
     */
    value128_t acc;

    /*
     * The mask value that is XORed into the final value of 'acc' to
     * produce the output MAC. This is different for every MAC
     * generated, because its purpose is to ensure that no information
     * gathered from a legal MAC can be used to help the forgery of
     * another one, and that comparing two legal MACs gives you no
     * useful information about the text they cover, because in each
     * case, the masks are different and pseudorandom.
     */
    value128_t mask;
} aesgcm_ref_poly;

static bool aesgcm_ref_poly_available(void)
{
    return true;  /* pure software implementation, always available */
}

/*
 * Primitive function that takes two uint64_t values representing
 * polynomials, multiplies them, and returns a value128_t struct
 * containing the full product.
 *
 * Because the input polynomials have maximum degree 63, the output
 * has max degree 63+63 = 127, not 128. As a result, the topmost bit
 * of the output is always zero.
 *
 * The inside of this function is implemented in the simplest way,
 * with no attention paid to performance. The important feature of
 * this implementation is not what's _inside_ this function, but
 * what's _outside_ it: aesgcm_ref_poly_coeff() tries to minimise the
 * number of these operations.
 */
static value128_t pmul(uint64_t x, uint64_t y)
{
    value128_t r;
    r.hi = r.lo = 0;

    uint64_t bit = 1 & y;
    r.lo ^= x & -bit;

    for (unsigned i = 1; i < 64; i++) {
        bit = 1 & (y >> i);
        uint64_t z = x & -bit;
        r.lo ^= z << i;
        r.hi ^= z >> (64-i);
    }

    return r;
}

/*
 * OK, I promised a long comment explaining what's going on in this
 * implementation, and now it's time.
 *
 * The way AES-GCM _itself_ is defined by its own spec, its finite
 * field consists of polynomials over GF(2), constrained to be 128
 * bits long by reducing them modulo P = x^128 + x^7 + x^2 + x + 1.
 * Using the usual binary representation in which bit i is the
 * coefficient of x^i, that's 0x100000000000000000000000000000087.
 *
 * That is, whenever you multiply two polynomials and find a term
 * x^128, you can replace it with x^7+x^2+x+1. More generally,
 * x^(128+n) can be replaced with x^(7+n)+x^(2+n)+x^(1+n)+x^n. In
 * binary terms, a 1 bit at the 128th position or above is replaced by
 * 0x87 exactly 128 bits further down.
 *
 * So you'd think that multiplying two 128-bit polynomials mod P would
 * be a matter of generating their full 256-bit product in the form of
 * four words HI:HU:LU:LO, and then reducing it mod P by a two-stage
 * process of computing HI * 0x87 and XORing it into HU:LU, then
 * computing HU * 0x87 and XORing it into LU:LO.
 *
 * But it's not!
 *
 * The reason why not is because when AES-GCM is applied to SSH,
 * somehow the _bit_ order got reversed. A 16-byte block of data in
 * memory is converted into a polynomial by regarding bit 7 of the
 * first byte as the constant term, bit 0 of the first byte as the x^7
 * coefficient, ..., bit 0 of the last byte as the x^127 coefficient.
 * So if we load that 16-byte block as a big-endian 128-bit integer,
 * we end up with it representing a polynomial back to front, with the
 * constant term at the top and the x^127 bit at the bottom.
 *
 * Well, that _shouldn't_ be a problem, right? The nice thing about
 * polynomial multiplication is that it's essentially reversible. If
 * you reverse the order of the coefficients of two polynomials, then
 * the product of the reversed polys is exactly the reversal of the
 * product of the original ones. So we bit-reverse our modulo
 * polynomial to get 0x1c2000000000000000000000000000001, and we just
 * pretend we're working mod that instead.
 *
 * And that is basically what we're about to do. But there's one
 * complication, that arises from the semantics of the polynomial
 * multiplication function we're using as our primitive operation.
 *
 * That function multiplies two polynomials of degree at most 63, to
 * give one with degree at most 127. So it returns a 128-bit integer
 * whose low bit is the constant term, and its very highest bit is 0,
 * and its _next_ highest bit is the product of the high bits of the
 * two inputs.
 *
 * That operation is _not_ symmetric in bit-reversal. If you give it
 * the 64-bit-wise reversals of two polynomials P,Q, then its output
 * is not the 128-bit-wise reversal of their product PQ, because that
 * would require the constant term of PQ to appear in bit 127 of the
 * output, and in fact it appears in bit 126. So in fact, what we get
 * is offset by one bit from where we'd like it: it's the bit-reversal
 * of PQx, not of PQ.
 *
 * There's more than one way we could fix this. One approach would be
 * to work around this off-by-one error by taking the 128-bit output
 * of pmul() and shifting it left by a bit. Then it _is_ the bitwise
 * reversal of the 128-bit value we'd have wanted to get, and we could
 * implement the exact algorithm described above, in fully
 * bit-reversed form.
 *
 * But a 128-bit left shift is not a trivial operation in the vector
 * architectures that this code is acting as a reference for. So we'd
 * prefer to find a fix that doesn't need compensation during the
 * actual per-block multiplication step.
 *
 * If we did the obvious thing anyway - compute the unshifted 128-bit
 * product representing the bit-reversal of PQx, and reduce it mod
 * 0x1c2000000000000000000000000000001 - then we'd get a result which
 * is exactly what we want, except that it's got a factor of x in it
 * that we need to get rid of. The obvious answer is to divide by x
 * (which is legal and safe, since mod P, x is invertible).
 *
 * Dividing a 128-bit polynomial by x is easy in principle. Shift left
 * (because we're still bit-reversed), and if that shifted a 1 bit off
 * the top, XOR 0xc2000000000000000000000000000001 into the remaining
 * 128 bits.
 *
 * But we're back to having that expensive left shift. What can we do
 * about that?
 *
 * Happily, one of the two input values to our per-block multiply
 * operation is fixed! It's given to us at key setup stage, and never
 * changed until the next rekey. So if at key setup time we do this
 * left shift business _once_, replacing the logical value Q with Q/x,
 * then that exactly cancels out the unwanted factor of x that shows
 * up in our multiply operation. And now it doesn't matter that it's
 * expensive (in the sense of 'a few more instructions than you'd
 * like'), because it only happens once per SSH key exchange, not once
 * per 16 bytes of data transferred.
 */

static void aesgcm_ref_poly_setkey_impl(aesgcm_ref_poly *ctx,
                                        const unsigned char *var)
{
    /*
     * Key setup function. We copy the provided 16-byte 'var'
     * value into our polynomial. But, as discussed above, we also
     * need to divide it by x.
     */

    ctx->var.hi = GET_64BIT_MSB_FIRST(var);
    ctx->var.lo = GET_64BIT_MSB_FIRST(var + 8);

    uint64_t bit = 1 & (ctx->var.hi >> 63);
    ctx->var.hi = (ctx->var.hi << 1) ^ (ctx->var.lo >> 63);
    ctx->var.lo = (ctx->var.lo << 1) ^ bit;
    ctx->var.hi ^= 0xC200000000000000 & -bit;
}

static inline void aesgcm_ref_poly_setup(aesgcm_ref_poly *ctx,
                                         const unsigned char *mask)
{
    /*
     * Set up to start evaluating a particular MAC. Copy in the mask
     * value for this packet, and initialise acc to zero.
     */

    ctx->mask.hi = GET_64BIT_MSB_FIRST(mask);
    ctx->mask.lo = GET_64BIT_MSB_FIRST(mask + 8);
    ctx->acc.hi = ctx->acc.lo = 0;
}

static inline void aesgcm_ref_poly_coeff(aesgcm_ref_poly *ctx,
                                         const unsigned char *coeff)
{
    /*
     * One step of Horner's-rule polynomial evaluation (with each
     * coefficient of the polynomial being an element of GF(2^128),
     * itself composed of polynomials over GF(2) mod P).
     *
     * We take our accumulator value, add the incoming coefficient
     * (which means XOR, by GF(2) rules), and multiply by x (that is,
     * 'var').
     */

    /*
     * The addition first, which is easy.
     */
    ctx->acc.hi ^= GET_64BIT_MSB_FIRST(coeff);
    ctx->acc.lo ^= GET_64BIT_MSB_FIRST(coeff + 8);

    /*
     * First, create the 256-bit product of the two 128-bit
     * polynomials over GF(2) stored in ctx->acc and ctx->var.
     *
     * The obvious way to do this is by four smaller multiplications
     * of 64x64 -> 128 bits. But we can do better using a single
     * iteration of the Karatsuba technique, which is actually more
     * convenient in polynomials over GF(2) than it is in integers,
     * because there aren't all those awkward carries complicating
     * things.
     *
     * Letting B denote x^64, and imagining our two inputs are split
     * up into 64-bit chunks ah,al,bh,bl, the product we want is
     *
     *         (ah B + al) (bh B + bl)
     *       = (ah bh) B^2 + (al bh + ah bl) B + (al bl)
     *
     * which looks like four smaller multiplications of each of ah,al
     * with each of bh,bl. But Karatsuba's trick is to first compute
     *
     *         (ah + al) (bh + bl)
     *       = ah bh + al bh + ah bl + al bl
     *
     * and then subtract the terms (ah bh) and (al bl), which we had
     * to compute anyway, to get the middle two terms (al bh + ah bl)
     * which are our coefficient of B.
     *
     * This involves more bookkeeping instructions like XORs, but with
     * any luck those are faster than the main multiplication.
     */
    uint64_t ah = ctx->acc.hi, al = ctx->acc.lo;
    uint64_t bh = ctx->var.hi, bl = ctx->var.lo;
    /* Compute the outer two terms */
    value128_t lo = pmul(al, bl);
    value128_t hi = pmul(ah, bh);
    /* Compute the trick product (ah+al)(bh+bl) */
    value128_t md = pmul(ah ^ al, bh ^ bl);
    /* Subtract off the outer two terms to get md = al bh + ah bl */
    md.hi ^= lo.hi ^ hi.hi;
    md.lo ^= lo.lo ^ hi.lo;
    /* And add that into the 256-bit value given by hi * x^128 + lo */
    lo.hi ^= md.lo;
    hi.lo ^= md.hi;

    /*
     * OK. Now hi and lo together make up the 256-bit full product.
     * Now reduce it mod the reversal of the GCM modulus polynomial.
     * As discussed above, that's 0x1c2000000000000000000000000000001.
     *
     * We want the _topmost_ 128 bits of this, because we're working
     * in a bit-reversed world. So what we fundamentally want to do is
     * to take our 256-bit product, and add to it the product of its
     * low 128 bits with 0x1c2000000000000000000000000000001. Then the
     * top 128 bits will be the output we want.
     *
     * Since there's no carrying in this arithmetic, it's enough to
     * discard the 1 bit at the bottom of that, because it won't
     * affect anything in the half we're keeping. So it's enough to
     * add 0x1c2000000000000000000000000000000 * lo to (hi:lo).
     *
     * We can only work with 64 bits at a time, so the first thing we
     * do is to break that up:
     *
     *  - add 0x1c200000000000000 * lo.lo to (hi.lo : lo.hi)
     *  - add 0x1c200000000000000 * lo.hi to (hi.hi : hi.lo)
     *
     * But there's still a problem: 0x1c200000000000000 is just too
     * _big_ to fit in 64 bits. So we have to break it up into the low
     * 64 bits 0xc200000000000000, and its leading 1. So each of those
     * steps of the form 'add 0x1c200000000000000 * x to y:z' becomes
     * 'add 0xc200000000000000 * x to y:z' followed by 'add x to y',
     * the latter step dealing with the leading 1.
     */

    /* First step, adding to the middle two words of our number. After
     * this the lowest word (in lo.lo) is ignored. */
    value128_t r1 = pmul(0xC200000000000000, lo.lo);
    hi.lo ^= r1.hi ^ lo.lo;
    lo.hi ^= r1.lo;

    /* Second of those steps, adding to the top two words, and
     * discarding lo.hi. */
    value128_t r2 = pmul(0xC200000000000000, lo.hi);
    hi.hi ^= r2.hi ^ lo.hi;
    hi.lo ^= r2.lo;

    /* Now 'hi' is precisely what we have left. */
    ctx->acc = hi;
}

static inline void aesgcm_ref_poly_output(aesgcm_ref_poly *ctx,
                                          unsigned char *output)
{
    PUT_64BIT_MSB_FIRST(output, ctx->acc.hi ^ ctx->mask.hi);
    PUT_64BIT_MSB_FIRST(output + 8, ctx->acc.lo ^ ctx->mask.lo);
    smemclr(&ctx->acc, 16);
    smemclr(&ctx->mask, 16);
}

#define AESGCM_FLAVOUR ref_poly
#define AESGCM_NAME "reference polynomial-based implementation"
#include "aesgcm-footer.h"