From 868c7ef1f489b565e3469ff739654a7bf258773e Mon Sep 17 00:00:00 2001 From: Adam Langley Date: Mon, 2 Feb 2015 13:34:50 -0800 Subject: Don't assume alignment of ChaCha key on ARM. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When addressing [1], I checked the AEAD code but brain-farted: a key is aligned in that code, but it's the Poly1305 key, which doesn't matter here. It would be nice to align the ChaCha key too, but Android doesn't have |posix_memalign| in the versions that we care about. It does have |memalign|, but that's documented as "obsolete" and we don't have a concept of an Android OS yet and I don't want to add one just for this. So this change uses the buffer for loading the key again. (Note that we never used to check for alignment of the |key| before calling this. We must have gotten it for free somehow when checking the alignment of |in| and |out|. But there are clearly some paths that don't have an aligned key: https://code.google.com/p/chromium/issues/detail?id=454308.) At least the generation script started paying off immediately ☺. [1] https://boringssl-review.googlesource.com/#/c/3132/1/crypto/chacha/chacha_vec.c@185 Change-Id: I4f893ba0733440fddd453f9636cc2aeaf05076ed Reviewed-on: https://boringssl-review.googlesource.com/3270 Reviewed-by: Adam Langley --- crypto/chacha/chacha_vec.c | 19 +----- crypto/chacha/chacha_vec_arm.S | 151 ++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 87 deletions(-) (limited to 'crypto') diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c index 88830bc6..2b0fd9c5 100644 --- a/crypto/chacha/chacha_vec.c +++ b/crypto/chacha/chacha_vec.c @@ -40,6 +40,7 @@ typedef unsigned vec __attribute__((vector_size(16))); * This implementation supports parallel processing of multiple blocks, * including potentially using general-purpose registers. */ #if __ARM_NEON__ +#include #include #define GPR_TOO 1 #define VBPI 2 @@ -162,29 +163,15 @@ void CRYPTO_chacha_20( uint8_t alignment_buffer[16] __attribute__((aligned(16))); #endif vec s0, s1, s2, s3; -#if !defined(__ARM_NEON__) && !defined(__SSE2__) - __attribute__ ((aligned (16))) unsigned key[8], nonce[4]; -#endif __attribute__ ((aligned (16))) unsigned chacha_const[] = {0x61707865,0x3320646E,0x79622D32,0x6B206574}; -#if defined(__ARM_NEON__) || defined(__SSE2__) - kp = (unsigned *)key; -#else - ((vec *)key)[0] = REVV_BE(((vec *)key)[0]); - ((vec *)key)[1] = REVV_BE(((vec *)key)[1]); - nonce[0] = REVW_BE(((unsigned *)nonce)[0]); - nonce[1] = REVW_BE(((unsigned *)nonce)[1]); - nonce[2] = REVW_BE(((unsigned *)nonce)[2]); - nonce[3] = REVW_BE(((unsigned *)nonce)[3]); kp = (unsigned *)key; - np = (unsigned *)nonce; -#endif #if defined(__ARM_NEON__) np = (unsigned*) nonce; #endif s0 = LOAD_ALIGNED(chacha_const); - s1 = LOAD_ALIGNED(&((vec*)kp)[0]); - s2 = LOAD_ALIGNED(&((vec*)kp)[1]); + s1 = LOAD(&((vec*)kp)[0]); + s2 = LOAD(&((vec*)kp)[1]); s3 = (vec){ counter & 0xffffffff, #if __ARM_NEON__ || defined(OPENSSL_X86) diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S index 15d4556c..a1fb541e 100644 --- a/crypto/chacha/chacha_vec_arm.S +++ b/crypto/chacha/chacha_vec_arm.S @@ -62,74 +62,88 @@ CRYPTO_chacha_20_neon: @ args = 8, pretend = 0, frame = 128 @ frame_needed = 1, uses_anonymous_args = 0 push {r4, r5, r6, r7, r8, r9, r10, fp, lr} - mov r4, r2 + mov ip, r3 vpush.64 {d8, d9, d10, d11, d12, d13, d14, d15} + mov r9, r2 + ldr r4, .L92+16 + mov fp, r0 + mov r10, r1 + mov lr, ip +.LPIC16: + add r4, pc movw r8, #43691 - movt r8, 43690 - mov ip, r3 - umull r8, r9, r4, r8 sub sp, sp, #132 add r7, sp, #0 sub sp, sp, #112 - mov fp, r0 - mov r10, r1 + movt r8, 43690 + str r0, [r7, #60] + str r1, [r7, #12] str r2, [r7, #8] + ldmia r4, {r0, r1, r2, r3} add r4, sp, #15 - ldr r2, .L92+16 bic r4, r4, #15 - ldr r5, [r7, #232] - add lr, r4, #64 -.LPIC16: - add r2, pc - str r0, [r7, #60] - str r1, [r7, #12] - str r3, [r7, #44] - ldmia r2, {r0, r1, r2, r3} - ldr r6, [r5] - str r4, [r7, #72] - ldr r5, [r5, #4] - ldr r4, [r7, #236] - str r6, [r7, #120] - str r5, [r7, #124] - str r4, [r7, #112] - stmia lr, {r0, r1, r2, r3} - movs r3, #0 - ldr r0, [r7, #72] - str r3, [r7, #116] - lsrs r3, r9, #7 + str ip, [r7, #44] + umull r8, r9, r9, r8 + mov r6, r4 + adds r4, r4, #64 + add r5, r6, #80 + str r5, [r7, #68] + stmia r4, {r0, r1, r2, r3} + movs r4, #0 + ldr r0, [ip] @ unaligned + ldr r1, [ip, #4] @ unaligned + ldr r2, [ip, #8] @ unaligned + ldr r3, [ip, #12] @ unaligned + vldr d24, [r6, #64] + vldr d25, [r6, #72] + str r4, [r7, #116] + mov r4, r5 + stmia r5!, {r0, r1, r2, r3} + ldr r0, [lr, #16]! @ unaligned + ldr r3, [r7, #232] + str r6, [r7, #72] + ldr r2, [lr, #8] @ unaligned + ldr ip, [r3] + ldr r6, [r7, #236] + ldr r1, [lr, #4] @ unaligned + ldr r3, [lr, #12] @ unaligned + ldr r5, [r7, #72] + vldr d26, [r5, #80] + vldr d27, [r5, #88] + str ip, [r7, #120] + stmia r4!, {r0, r1, r2, r3} + lsrs r2, r9, #7 + ldr r3, [r7, #232] + str r6, [r7, #112] + vldr d28, [r5, #80] + vldr d29, [r5, #88] + ldr r3, [r3, #4] + str r3, [r7, #124] vldr d22, [r7, #112] vldr d23, [r7, #120] - vldr d24, [r0, #64] - vldr d25, [r0, #72] - vld1.64 {d26-d27}, [ip:64] - vldr d28, [ip, #16] - vldr d29, [ip, #24] beq .L26 - ldr r1, [r0, #64] - lsls r2, r3, #8 - sub r3, r2, r3, lsl #6 - str r3, [r7, #4] - ldr r2, [r0, #72] - str r1, [r7, #40] - mov r1, r3 - ldr r3, [r0, #68] + lsls r1, r2, #8 + ldr r4, [r5, #64] + sub r2, r1, r2, lsl #6 + str r2, [r7, #4] vldr d0, .L92 vldr d1, .L92+8 - str r2, [r7, #32] - adds r2, r4, #2 - str r3, [r7, #36] - ldr r3, [r0, #76] - str r2, [r7, #48] - mov r2, r0 mov r0, fp + mov r1, r2 + ldr r2, [r5, #68] + str r4, [r7, #40] + ldr r4, [r5, #72] + str r2, [r7, #36] + ldr r2, [r5, #76] + str r4, [r7, #32] + adds r4, r6, #2 str r10, [r7, #64] - str r3, [r7, #28] - adds r3, r0, r1 - mov r1, r6 - str r3, [r7, #16] - add r3, r2, #80 - mov r2, r5 - str r3, [r7, #68] + str r2, [r7, #28] + adds r2, r0, r1 + mov r1, ip + str r2, [r7, #16] + mov r2, r3 + str r4, [r7, #48] .L4: ldr r0, [r7, #44] add r8, r7, #28 @@ -749,14 +763,12 @@ CRYPTO_chacha_20_neon: rsb fp, fp, r1 lsrs fp, fp, #6 beq .L6 - ldr r6, [r7, #72] ldr r5, [r7, #12] ldr r4, [r7, #16] - mov r3, r6 - adds r3, r3, #80 + ldr r6, [r7, #72] + ldr lr, [r7, #68] vldr d30, .L95 vldr d31, .L95+8 - mov lr, r3 str fp, [r7, #104] str fp, [r7, #108] .L8: @@ -1299,18 +1311,15 @@ CRYPTO_chacha_20_neon: vldm sp!, {d8-d15} pop {r4, r5, r6, r7, r8, r9, r10, fp, pc} .L89: - ldr r4, [r7, #12] + ldr r5, [r7, #12] vadd.i32 q12, q12, q10 - ldr r5, [r7, #72] + ldr r4, [r7, #68] cmp r9, #31 - ldr r0, [r4] @ unaligned - add r6, r5, #80 - ldr r1, [r4, #4] @ unaligned - ldr r2, [r4, #8] @ unaligned - mov r5, r6 - ldr r3, [r4, #12] @ unaligned - mov r4, r6 - str r6, [r7, #68] + ldr r0, [r5] @ unaligned + ldr r1, [r5, #4] @ unaligned + mov r6, r4 + ldr r2, [r5, #8] @ unaligned + ldr r3, [r5, #12] @ unaligned stmia r6!, {r0, r1, r2, r3} ldr r2, [r7, #72] ldr r6, [r7, #16] @@ -1355,13 +1364,13 @@ CRYPTO_chacha_20_neon: str fp, [r7, #16] b .L2 .L90: - ldr r3, [r7, #12] + mov r3, r5 + ldr r4, [r7, #68] + ldr r0, [r3, #16]! @ unaligned add lr, r1, #16 - mov r4, r5 - mov r6, r5 mov r5, r1 vadd.i32 q13, q13, q15 - ldr r0, [r3, #16]! @ unaligned + mov r6, r4 cmp r9, #47 ldr r1, [r3, #4] @ unaligned ldr r2, [r3, #8] @ unaligned -- cgit v1.2.3