github.com/google/ruy.git
Diffstat (limited to 'ruy/kernel_arm32.cc')
-rw-r--r--  ruy/kernel_arm32.cc | 2499
1 file changed, 2499 insertions, 0 deletions
diff --git a/ruy/kernel_arm32.cc b/ruy/kernel_arm32.cc
new file mode 100644
index 0000000..d537cfe
--- /dev/null
+++ b/ruy/kernel_arm32.cc
@@ -0,0 +1,2499 @@
+/* Copyright 2019 Google LLC. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "ruy/kernel.h"
+#include "ruy/opt_set.h"
+#include "ruy/platform.h"
+#include "ruy/profiler/instrumentation.h"
+
+namespace ruy {
+
+#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+
+#define RUY_ASM_LABEL_STORE_UINT8 91
+#define RUY_ASM_LABEL_STORE_INT8 92
+#define RUY_ASM_LABEL_STORE_INT16 93
+#define RUY_ASM_LABEL_STORE_INT32 94
+#define RUY_ASM_LABEL_AFTER_STORE 99
+
+#define RUY_OFFSET_LHS_BASE_PTR 0
+#define RUY_OFFSET_RHS_BASE_PTR 4
+#define RUY_OFFSET_DST_BASE_PTR 8
+#define RUY_OFFSET_BIAS 12
+#define RUY_OFFSET_START_ROW 16
+#define RUY_OFFSET_START_COL 20
+#define RUY_OFFSET_LAST_ROW 24
+#define RUY_OFFSET_LAST_COL 28
+#define RUY_OFFSET_DST_ROWS 32
+#define RUY_OFFSET_DST_COLS 36
+#define RUY_OFFSET_LHS_STRIDE 40
+#define RUY_OFFSET_RHS_STRIDE 44
+#define RUY_OFFSET_DST_STRIDE 48
+#define RUY_OFFSET_DEPTH 52
+#define RUY_OFFSET_CLAMP_MIN 56
+#define RUY_OFFSET_CLAMP_MAX 60
+#define RUY_OFFSET_FLAGS 64
+
+#define RUY_STACK_OFFSET_SIZE 96
+#define RUY_STACK_OFFSET_DST_COL_PTR 0
+#define RUY_STACK_OFFSET_DST_PTR 16
+#define RUY_STACK_OFFSET_ROW 32
+#define RUY_STACK_OFFSET_COL 48
+#define RUY_STACK_OFFSET_LHS_COL_PTR 64
+#define RUY_STACK_OFFSET_RHS_COL_PTR 80
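+// Note: the stack slots above are spaced 16 bytes apart even though each
+// holds a single 4-byte value; RUY_STACK_OFFSET_SIZE is the size of the
+// scratch area reserved on the stack at the start of each asm kernel below.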
+
+template <typename Params>
+void CheckOffsetsInKernelParamsFloat32(const Params&) {
+ static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
+ static_assert(offsetof(Params, rhs_base_ptr) == RUY_OFFSET_RHS_BASE_PTR, "");
+ static_assert(offsetof(Params, dst_base_ptr) == RUY_OFFSET_DST_BASE_PTR, "");
+ static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
+ static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
+ static_assert(offsetof(Params, start_col) == RUY_OFFSET_START_COL, "");
+ static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
+ static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
+ static_assert(offsetof(Params, dst_rows) == RUY_OFFSET_DST_ROWS, "");
+ static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
+ static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
+ static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
+ static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
+ static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
+ static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
+ static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
+}
+
+// Float kernel for ARM32 out-of-order cores.
+// Just like the float ARM64 version, except it accumulates into an 8x4 block
+// so that it fits within the 16 128-bit NEON registers. This is a "first
+// pass" kernel, not yet tuned. It is meant to run on out-of-order CPUs like
+// the Krait 400 or the Cortex-A9.
+void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
+ CheckOffsetsInKernelParamsFloat32(params);
+ profiler::ScopeLabel label(
+ "Kernel (kNeon, optimized for out-of-order cores)");
+
+ const float* lhs_ptr = params.lhs_base_ptr;
+ const float* rhs_ptr = params.rhs_base_ptr;
+ // In ARM32 NEON, there are 16 128-bit "q" registers. These registers are
+ // each composed of two 64-bit "d" registers. The asm kernel below has the
+ // following NEON register allocation:
+ // Registers q3 -- q10 are accumulators. During accumulation,
+ // q0 -- q2 (d0 -- d5) are used to load data from LHS and RHS. q0 and q1
+ // are used to load an 8x1 block of LHS, and q2 is used to load a 1x4 block
+ // of RHS, like this:
+
+ // Register layout in "q" registers:
+ // RHS 1x4 block
+ // /--------------------------\
+ // |q2.s[0] ... q2.s[3] |
+ // \--------------------------/
+ // LHS 8x1 block
+ // /---------------------\ /--------------------------\
+ // | q0.s[0] | | q3.s[0] ... q9.s[0] |
+ // | ... | | ... ... |
+ // | q0.s[3] | | q3.s[3] q9.s[3] |
+ // | q1.s[0] | | q4.s[0] q10.s[0] |
+ // | ... | | ... ... ... |
+ // | q1.s[3] | | q4.s[3] .. q10.s[3] |
+ // \---------------------/ \--------------------------/
+ // accumulators 8x4 block
+ // q11, q14 and q15 are currently unused. q12 and q13 are used to load
+ // parameters for the post-accumulation part of the kernel.
+ // For completeness, here is the register layout in "d" registers:
+ // RHS 1x4 block
+ // /--------------------------\
+ // |d4[0] ... d5[1] |
+ // \--------------------------/
+ // LHS 8x1 block
+ // /---------------------\ /--------------------------\
+ // | d0[0] | | d6[0] ... d18[0] |
+ // | ... | | ... ... |
+ // | d1[1] | | d7[1] d19[1] |
+ // | d2[0] | | d8[0] d20[0] |
+ // | ... | | ... ... ... |
+ // | d3[1] | | d9[1] ... d21[1] |
+ // \---------------------/ \--------------------------/
+ // accumulators 8x4 block
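+ // In scalar terms, the accumulation loop below computes (a plain-C
+ // sketch, not part of this kernel):
+ //   for (int d = 0; d < depth; d++)
+ //     for (int j = 0; j < 4; j++)
+ //       for (int i = 0; i < 8; i++)
+ //         acc[i][j] += lhs[d * 8 + i] * rhs[d * 4 + j];
+ // with acc[i][j] living in lanes of q3 -- q10, and each vmla.f32
+ // multiplying a whole LHS q register by one RHS scalar lane.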
+ asm volatile(
+#define RUY_MAKE_ZERO(reg) "vmov.f32 " #reg ", #0.0\n"
+
+ // clang-format off
+
+ // Load the first 32 bytes of LHS and RHS data.
+ // Load q0, q1
+ "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n"
+ "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ // Load q2
+ "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
+ "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ // Clear accumulators.
+ RUY_MAKE_ZERO(q3)
+ RUY_MAKE_ZERO(q4)
+ RUY_MAKE_ZERO(q5)
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial ld1 instructions
+ // above, this is currently 1.
+ "mov r1, #1\n"
+
+ // Main loop of the whole GEMM, over rows and columns of the
+ // destination matrix.
+ "1:\n"
+
+ // Accumulation loop
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
+ "cmp r1, r2\n"
+ "beq 79f\n"
+
+ "2:\n"
+
+ "vmla.f32 q3, q0, d4[0]\n"
+ "vmla.f32 q5, q0, d4[1]\n"
+ "vmla.f32 q7, q0, d5[0]\n"
+ "vmla.f32 q9, q0, d5[1]\n"
+ "vld1.32 {d0, d1}, [%[lhs_ptr]]!\n" // Reload LHS
+
+ "vmla.f32 q4, q1, d4[0]\n"
+ "vmla.f32 q6, q1, d4[1]\n"
+ "vmla.f32 q8, q1, d5[0]\n"
+ "vmla.f32 q10, q1, d5[1]\n"
+ "vld1.32 {d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n" // Reload RHS
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ "add r1, r1, #1\n"
+ "cmp r1, r2\n"
+
+ "blt 2b\n"
+
+ "79:\n"
+
+ // End of the inner loop on depth. Now perform the remaining
+ // multiply-adds of the last level of depth, for which the LHS
+ // and RHS data is already loaded.
+
+ "vmla.f32 q3, q0, d4[0]\n"
+ "vmla.f32 q5, q0, d4[1]\n"
+ "vmla.f32 q7, q0, d5[0]\n"
+ "vmla.f32 q9, q0, d5[1]\n"
+
+ "vmla.f32 q4, q1, d4[0]\n"
+ "vmla.f32 q6, q1, d4[1]\n"
+ "vmla.f32 q8, q1, d5[0]\n"
+ "vmla.f32 q10, q1, d5[1]\n"
+
+ // End of accumulation. The registers q3 -- q10 contain the final
+ // float32 accumulator values of the current 8x4 destination block.
+ // We now have to compute the final values from these accumulators
+ // and advance to the next 8x4 block. We intertwine
+ // these two aspects whenever possible for optimal pipelining, both
+ // at the data flow level (prefetch data for next block as early as
+ // possible) and instruction pipelining level (some of the next-block
+ // work can dual-issue with some of the final work on the current
+ // block).
+
+ // Logic to advance to the next block in preparation for the next
+ // iteration of the main loop. For now, we only want to compute
+ // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
+ // not yet ready to update the values of row and col, as we still need
+ // the current values for the rest of the work on the current block.
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r1, r3\n" // Have we finished the last row?
+
+ "bge 4f\n" // If finished last row, go to 4
+ // Not finished last row: then advance to next row.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "add r4, r4, r1, lsl #3\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "b 5f\n"
+ "4:\n" // Finished last row...
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ // Go back to first row
+ "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ // Now we need to advance to the next column. If we already
+ // finished the last column, then in principle we are done, however
+ // we can't just return here, as we need to allow the end work of the
+ // current block to complete. The good news is that at this point it
+ // doesn't matter what data we load for the next column, since
+ // we will exit from the main loop below before actually storing
+ // anything computed from that data.
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n" // Have we finished the last column?
+ "bge 5f\n" // If yes, just carry on without updating the column pointer.
+ // Not finished last column: then advance to next column.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
+ "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "add r10, r10, r1, lsl #2\n"
+ "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "5:\n"
+
+ // Set the LHS and RHS data pointers to the start of the columns just
+ // computed.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "mov %[lhs_ptr], r4\n"
+ "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "mov %[rhs_ptr], r5\n"
+
+ // Load some parameters needed for the end work on current block.
+ "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
+
+ // Offset these base pointers as needed given the current row, col.
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "add r5, r1, r8, lsl #2\n"
+
+ "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
+ "it ne\n"
+ "movne r1, r5\n"
+
+ // Load 8 bias values.
+ "vld1.32 {d24, d25, d26, d27}, [r1]\n"
+
+ // Now that we know what LHS and RHS data the next iteration of the
+ // main loop will need to load, we start loading the first 32 bytes of
+ // LHS and 16 bytes of RHS, into q0 -- q2, as we don't need q0 -- q2
+ // anymore in the rest of the work on the current block.
+ // Load q0, q1
+ "vld1.32 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ // Load q2
+ "vld1.32 {d4, d5}, [%[rhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ // Perform the bias-addition.
+ "vadd.f32 q3, q3, q12\n"
+ "vadd.f32 q4, q4, q13\n"
+ "vadd.f32 q5, q5, q12\n"
+ "vadd.f32 q6, q6, q13\n"
+ "vadd.f32 q7, q7, q12\n"
+ "vadd.f32 q8, q8, q13\n"
+ "vadd.f32 q9, q9, q12\n"
+ "vadd.f32 q10, q10, q13\n"
+
+ // Load the clamp_min, clamp_max bounds
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.32 q12, r2\n" // clamp_min
+ "vdup.32 q13, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.f32 q3, q3, q12\n"
+ "vmax.f32 q4, q4, q12\n"
+ "vmax.f32 q5, q5, q12\n"
+ "vmax.f32 q6, q6, q12\n"
+ "vmax.f32 q7, q7, q12\n"
+ "vmax.f32 q8, q8, q12\n"
+ "vmax.f32 q9, q9, q12\n"
+ "vmax.f32 q10, q10, q12\n"
+
+ // Apply the clamp_max bound
+ "vmin.f32 q3, q3, q13\n"
+ "vmin.f32 q4, q4, q13\n"
+ "vmin.f32 q5, q5, q13\n"
+ "vmin.f32 q6, q6, q13\n"
+ "vmin.f32 q7, q7, q13\n"
+ "vmin.f32 q8, q8, q13\n"
+ "vmin.f32 q9, q9, q13\n"
+ "vmin.f32 q10, q10, q13\n"
+
+ // Compute how much of the 8x4 block of destination values we have
+ // computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 8x4, there are some 8x4 blocks along the boundaries that do
+ // not fit entirely.
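+ // In other words, the code below computes
+ //   r1 = min(8, dst_rows - row), r2 = min(4, dst_cols - col)
+ // and takes the fast path only when r1 == 8 && r2 == 4.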
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #8\n"
+ "mov r5, #4\n"
+ "cmp r1, #8\n"
+ // Compute r1 = how many rows of the 8x4 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+ "cmp r2, #4\n"
+ // Compute r2 = how many cols of the 8x4 block fit
+ "it gt\n"
+ "movgt r2, r5\n"
+
+ // Test if r1==8 && r2 == 4, i.e. if all of the 8x4 block fits.
+ "cmp r1, r3\n"
+ "it eq\n"
+ "cmpeq r2, r5\n"
+ // Yes, all of the 8x4 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 8x4 block fits.
+ // Set (r3 address, r4 stride) to write to dst_tmp_buf
+ "mov r3, %[dst_tmp_buf]\n"
+ "mov r4, #32\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 8x4 block fits.
+ // Set (r3 address, r4 stride) to write directly to destination matrix.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r5\n"
+ "31:\n"
+
+ // Write our float values to the destination described by
+ // (r3 address, r4 stride)
+ "vst1.32 {d6, d7, d8, d9}, [r3]\n"
+ "add r3, r3, r4\n"
+ RUY_MAKE_ZERO(q3)
+ RUY_MAKE_ZERO(q4)
+ "vst1.32 {d10, d11, d12, d13}, [r3]\n"
+ "add r3, r3, r4\n"
+ RUY_MAKE_ZERO(q5)
+ RUY_MAKE_ZERO(q6)
+ "vst1.32 {d14, d15, d16, d17}, [r3]\n"
+ "add r3, r3, r4\n"
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ "vst1.32 {d18, d19, d20, d21}, [r3]\n"
+ "add r3, r3, r4\n"
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+
+ // If all of the 8x4 block fits, we just finished writing it to the
+ // destination, so we skip the next part.
+ "beq 41f\n"
+ // Not all of the 8x4 block fits in the destination matrix. We just
+ // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
+ // it to copy into the destination matrix the part that fits.
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "mov r3, %[dst_tmp_buf]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r6, #0\n"
+ "50:\n"
+ "mov r5, #0\n"
+ "51:\n"
+ "ldr r10, [r3, r5, lsl #2]\n"
+ "str r10, [r4, r5, lsl #2]\n"
+ "add r5, r5, #1\n"
+ "cmp r5, r1\n"
+ "blt 51b\n"
+ "add r6, r6, #1\n"
+ "add r3, r3, #32\n"
+ "add r4, r4, r8\n"
+ // r2 = how many cols of the 8x4 block fit
+ "cmp r6, r2\n"
+ "blt 50b\n"
+ "41:\n"
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #32\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ // At this point we have completely finished writing values to the
+ // destination matrix for the current block.
+
+ // Reload some params --- we had used r3, r5, r10 for a few other things
+ // since the last time we had loaded them.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+
+ // Move to the next block of the destination matrix, for the next iter
+ // of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
+ // been updated earlier.
+ // Have we reached the end row?
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r8, r3\n"
+
+ "beq 20f\n" // yes, end row.
+ // Not end row. Move to the next row.
+ "add r8, r8, #8\n"
+ // Store new value of row
+ "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "b 21f\n"
+ "20:\n"
+ // Was already at end row.
+ // Move back to first row.
+ "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ // Move to the next column.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "add r4, r4, #4\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Increment dst_col_ptr by 4 * dst_stride (i.e. 4 columns)
+ "add r1, r1, r8, lsl #2\n"
+ // Store dst_col_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Store dst_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "21:\n"
+
+ // Main loop exit condition: have we hit the end column?
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n"
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial ld1 instructions
+ // above, this is currently 1.
+ "mov r1, #1\n"
+
+ "ble 1b\n"
+
+ // Restore stack pointer.
+ "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ // clang-format on
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
+ : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
+ // Clobber list must specify q registers (and not their constituent
+ // d registers). There is a (currently unexplained) slowdown if
+ // d registers are listed in the clobbers list.
+ : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
+ "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q12", "q13");
+}
+
+#undef RUY_MAKE_ZERO
+#undef RUY_STACK_OFFSET_SIZE
+#undef RUY_STACK_OFFSET_DST_COL_PTR
+#undef RUY_STACK_OFFSET_DST_PTR
+#undef RUY_STACK_OFFSET_ROW
+#undef RUY_STACK_OFFSET_COL
+#undef RUY_STACK_OFFSET_LHS_COL_PTR
+#undef RUY_STACK_OFFSET_RHS_COL_PTR
+
+#undef RUY_OFFSET_LHS_BASE_PTR
+#undef RUY_OFFSET_RHS_BASE_PTR
+#undef RUY_OFFSET_DST_BASE_PTR
+#undef RUY_OFFSET_BIAS
+#undef RUY_OFFSET_START_ROW
+#undef RUY_OFFSET_START_COL
+#undef RUY_OFFSET_LAST_ROW
+#undef RUY_OFFSET_LAST_COL
+#undef RUY_OFFSET_DST_ROWS
+#undef RUY_OFFSET_DST_COLS
+#undef RUY_OFFSET_LHS_STRIDE
+#undef RUY_OFFSET_RHS_STRIDE
+#undef RUY_OFFSET_DST_STRIDE
+#undef RUY_OFFSET_DEPTH
+#undef RUY_OFFSET_CLAMP_MIN
+#undef RUY_OFFSET_CLAMP_MAX
+#undef RUY_OFFSET_FLAGS
+
+#define RUY_OFFSET_BIAS 0
+#define RUY_OFFSET_LHS_SUMS 4
+#define RUY_OFFSET_RHS_SUMS 8
+#define RUY_OFFSET_LHS_BASE_PTR 12
+#define RUY_OFFSET_MULTIPLIER_FIXEDPOINT 16
+#define RUY_OFFSET_MULTIPLIER_EXPONENT 20
+#define RUY_OFFSET_RHS_BASE_PTR 24
+#define RUY_OFFSET_DST_BASE_PTR 28
+#define RUY_OFFSET_LHS_ZERO_POINT 32
+#define RUY_OFFSET_RHS_ZERO_POINT 36
+#define RUY_OFFSET_DST_ZERO_POINT 40
+#define RUY_OFFSET_PROD_ZP_DEPTH 44
+#define RUY_OFFSET_START_ROW 48
+#define RUY_OFFSET_START_COL 52
+#define RUY_OFFSET_LAST_ROW 56
+#define RUY_OFFSET_LAST_COL 60
+#define RUY_OFFSET_DST_ROWS 64
+#define RUY_OFFSET_DST_COLS 68
+#define RUY_OFFSET_LHS_STRIDE 72
+#define RUY_OFFSET_RHS_STRIDE 76
+#define RUY_OFFSET_DST_STRIDE 80
+#define RUY_OFFSET_DEPTH 84
+#define RUY_OFFSET_CLAMP_MIN 88
+#define RUY_OFFSET_CLAMP_MAX 92
+#define RUY_OFFSET_FLAGS 96
+#define RUY_OFFSET_DST_TYPE_ID 97
+
+#define RUY_STACK_OFFSET_SIZE 96
+#define RUY_STACK_OFFSET_DST_COL_PTR 0
+#define RUY_STACK_OFFSET_DST_PTR 16
+#define RUY_STACK_OFFSET_ROW 32
+#define RUY_STACK_OFFSET_COL 48
+#define RUY_STACK_OFFSET_LHS_COL_PTR 64
+#define RUY_STACK_OFFSET_RHS_COL_PTR 80
+
+template <typename Params>
+void CheckOffsetsInKernelParams8bit(const Params&) {
+ static_assert(offsetof(Params, lhs_zero_point) == RUY_OFFSET_LHS_ZERO_POINT,
+ "");
+ static_assert(offsetof(Params, rhs_zero_point) == RUY_OFFSET_RHS_ZERO_POINT,
+ "");
+ static_assert(offsetof(Params, dst_zero_point) == RUY_OFFSET_DST_ZERO_POINT,
+ "");
+ static_assert(offsetof(Params, prod_zp_depth) == RUY_OFFSET_PROD_ZP_DEPTH,
+ "");
+ static_assert(offsetof(Params, multiplier_fixedpoint) ==
+ RUY_OFFSET_MULTIPLIER_FIXEDPOINT,
+ "");
+ static_assert(
+ offsetof(Params, multiplier_exponent) == RUY_OFFSET_MULTIPLIER_EXPONENT,
+ "");
+ static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
+ static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
+ static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
+ static_assert(offsetof(Params, lhs_sums) == RUY_OFFSET_LHS_SUMS, "");
+ static_assert(offsetof(Params, rhs_sums) == RUY_OFFSET_RHS_SUMS, "");
+ static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
+ static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
+ static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
+ static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
+ static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
+ static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
+ static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
+ static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
+ static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
+}
+
+// Fast-int8 kernel, ported from the ARM64 version.
+// Relevant target CPUs for this kernel include Krait 400 and A9,
+// since these are 32-bit, out-of-order CPUs.
+void Kernel8bitNeonOutOfOrder(const KernelParams8bit<4, 2>& params) {
+ profiler::ScopeLabel label(
+ "Kernel (kNeon, optimized for out-of-order cores)");
+
+ CheckOffsetsInKernelParams8bit(params);
+
+ const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
+ const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
+ const std::int8_t* lhs_ptr = lhs_col_ptr;
+ const std::int8_t* rhs_ptr = rhs_col_ptr;
+
+ // The asm kernel below has the following NEON register allocation:
+ //
+ // q6 - q13 are 128-bit (4x32b) accumulators.
+ // During accumulation, d0 -- d3 are used to load int8 data from LHS and
+ // d8 -- d11 from RHS, while q2, q3, q14 and q15 hold the intermediate
+ // 16-bit products:
+ // int8 RHS 16x2 block
+ // /-----------------------------\
+ // |d8.b[0-7] ..... d10.b[0-7]|
+ // | ... ... |
+ // |d9.b[0-7] ..... d11.b[0-7]|
+ // \-----------------------------/
+ // int8 LHS 4x16 block
+ // /------------------------\ /-----------------------------\
+ // |d0.b[0-7] ... d1.b[0-7] | | q6 ..... q10 |
+ // |d2.b[0-7] ... d3.b[0-7] | | q7 ..... q11 |
+ // (Reload d0, d1, d2, d3)
+ // |d0.b[0-7] ... d1.b[0-7] | | q8 ..... q12 |
+ // |d2.b[0-7] ... d3.b[0-7] | | q9 ..... q13 |
+ // \------------------------/ \-----------------------------/
+ // 128-bit accumulators 4x2 block
+ //
+ // No attempt has been made so far at implementing the RUY_OPT_MAX_STREAMING
+ // optimization for this kernel.
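+ // In scalar terms, each 16-deep pass of the accumulation loop below
+ // computes (a plain-C sketch, not part of this kernel):
+ //   for (int i = 0; i < 4; i++)
+ //     for (int j = 0; j < 2; j++)
+ //       for (int k = 0; k < 16; k++)
+ //         acc[i][j] += (int32)lhs[i][k] * (int32)rhs[j][k];
+ // vmull.s8/vmlal.s8 form the 16-bit products and vpadal.s16 folds them
+ // pairwise into the 32-bit accumulator lanes; the final reduction to one
+ // 32-bit value per (i, j) happens in the vpadd.i32 sequence after the loop.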
+ asm volatile(
+#define RUY_MAKE_ZERO(reg) "vmov.i32 " #reg ", #0x00000000\n"
+
+ // clang-format off
+
+ // Load the first 64 bytes of LHS and RHS data.
+ "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
+ // Clear accumulators.
+ RUY_MAKE_ZERO(q6)
+ "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
+ RUY_MAKE_ZERO(q11)
+ "vld1.8 {d10, d11}, [%[rhs_ptr]]!\n"
+
+ "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ RUY_MAKE_ZERO(q12)
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ RUY_MAKE_ZERO(q13)
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ RUY_MAKE_ZERO(q14)
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
+ RUY_MAKE_ZERO(q15)
+ "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial ld1 instructions
+ // above, this is currently 16.
+ "mov r1, #16\n"
+
+ // Main loop of the whole GEMM, over rows and columns of the
+ // destination matrix.
+ "1:\n"
+
+ // r1 is how many levels of depth we have already loaded
+ // data for, r10 is the total depth.
+ "ldr r10, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
+ "cmp r1, r10\n"
+ "beq 79f\n"
+
+ "2:\n"
+
+ // Mult, mult-acc in to q14, q15, q2, q3
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q2, d0, d10\n"
+
+ "vmull.s8 q15, d2, d8\n"
+ "vmull.s8 q3, d2, d10\n"
+
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q2, d1, d11\n"
+ "vmlal.s8 q15, d3, d9\n"
+ "vmlal.s8 q3, d3, d11\n"
+ "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
+
+ // Then pairwise accumulate in to q6, q7, q10, q11
+ "vpadal.s16 q6, q14\n"
+ "vpadal.s16 q7, q15\n"
+ "vpadal.s16 q10, q2\n"
+ "vpadal.s16 q11, q3\n"
+
+ // Mult, mult-acc in to q14, q15, q2, q3
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q2, d0, d10\n"
+
+ "vmull.s8 q15, d2, d8\n"
+ "vmull.s8 q3, d2, d10\n"
+
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q2, d1, d11\n"
+ "vmlal.s8 q15, d3, d9\n"
+ "vmlal.s8 q3, d3, d11\n"
+ "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
+
+ // Then pairwise accumulate in to q8, q9, q12, q13
+ "vpadal.s16 q8, q14\n"
+ "vld1.8 {d8, d9, d10, d11}, [%[rhs_ptr]]!\n"
+ "vpadal.s16 q9, q15\n"
+ "vpadal.s16 q12, q2\n"
+ "vpadal.s16 q13, q3\n"
+
+ // Prefetch the next 64 bytes of LHS and RHS data.
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ // Each iteration of this loop advances by 16 levels of depth.
+ "add r1, r1, #16\n"
+
+ // Loop termination condition
+ "cmp r1, r10\n"
+
+ "blt 2b\n"
+
+ "79:\n"
+
+ // Mult, mult-acc in to q14, q15, q2, q3
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q2, d0, d10\n"
+
+ "vmull.s8 q15, d2, d8\n"
+ "vmull.s8 q3, d2, d10\n"
+
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q2, d1, d11\n"
+ "vmlal.s8 q15, d3, d9\n"
+ "vmlal.s8 q3, d3, d11\n"
+ "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n" // Reload LHS
+
+ // Then pairwise accumulate in to q6, q7, q10, q11
+ "vpadal.s16 q6, q14\n"
+ "vpadal.s16 q7, q15\n"
+ "vpadal.s16 q10, q2\n"
+ "vpadal.s16 q11, q3\n"
+
+ // Mult, mult-acc in to q14, q15, q2, q3
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q2, d0, d10\n"
+
+ "vmull.s8 q15, d2, d8\n"
+ "vmull.s8 q3, d2, d10\n"
+
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q2, d1, d11\n"
+ "vmlal.s8 q15, d3, d9\n"
+ "vmlal.s8 q3, d3, d11\n"
+
+ // Then pairwise accumulate in to q8, q9, q12, q13
+ "vpadal.s16 q8, q14\n"
+ "vpadal.s16 q9, q15\n"
+ "vpadal.s16 q12, q2\n"
+ "vpadal.s16 q13, q3\n"
+
+
+ // All accumulation over depth done. q6 - q13 contain the 4x32b
+ // accumulators for the 4x2 final matrix.
+ // We now have to compute the final 8-bit values from these int32
+ // accumulators, and advance to the next 4x2 block. We intertwine
+ // these two aspects whenever possible for optimal pipelining, both
+ // at the data flow level (prefetch data for next block as early as
+ // possible) and instruction pipelining level (some of the next-block
+ // work can dual-issue with some of the final work on the current
+ // block).
+
+ // q6-q13 now contain 4 x 32b
+ "vpadd.i32 d0, d12, d13\n"
+ "vpadd.i32 d1, d14, d15\n"
+ "vpadd.i32 d2, d16, d17\n"
+ "vpadd.i32 d3, d18, d19\n"
+ "vpadd.i32 d4, d20, d21\n"
+ "vpadd.i32 d5, d22, d23\n"
+ "vpadd.i32 d6, d24, d25\n"
+ "vpadd.i32 d7, d26, d27\n"
+
+ // d0-d7 each contain 2 x 32b accumulators.
+ // Need to add pairwise to get 1 x 32b for each of the 4x2 entries
+ // of the destination (four 'd' registers total).
+ "vpadd.i32 d28, d0, d1\n"
+ "vpadd.i32 d29, d2, d3\n"
+ "vpadd.i32 d30, d4, d5\n"
+ "vpadd.i32 d31, d6, d7\n"
+
+ // Now d28 -- d31 have the 1 x 32b accumulators for the 4x2 entries.
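+ // (q14 = d28,d29 holds the four rows of column 0 of the 4x2 block, and
+ // q15 = d30,d31 holds the four rows of column 1.)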
+
+ // Logic to advance to the next block in preparation for the next
+ // iteration of the main loop. For now, we only want to compute
+ // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
+ // not yet ready to update the values of row and col, as we still need
+ // the current values for the rest of the work on the current block.
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r1, r3\n" // Have we finished the last row?
+
+ "bge 4f\n" // If finished last row, go to 4
+ // Not finished last row: then advance to next row.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "add r4, r4, r1, lsl #2\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "b 5f\n"
+ "4:\n" // Finished last row...
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ // Go back to first row
+ "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+
+ // Now we need to advance to the next column. If we already
+ // finished the last column, then in principle we are done, however
+ // we can't just return here, as we need to allow the end work of the
+ // current block to complete. The good news is that at this point it
+ // doesn't matter what data we load for the next column, since
+ // we will exit from the main loop below before actually storing
+ // anything computed from that data.
+
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n" // Have we finished the last column?
+ "bge 5f\n" // If yes, just carry on without updating the column pointer.
+ // Not finished last column: then advance to next column.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
+ "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "add r10, r10, r1, lsl #1\n"
+ "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "5:\n"
+
+ // Set the LHS and RHS data pointers to the start of the columns just
+ // computed.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "mov %[lhs_ptr], r4\n"
+ "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "mov %[rhs_ptr], r5\n"
+
+ // Now we load: bias data, LHS sums data, RHS sums data.
+
+ // First, load the base pointers from the params.
+ "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
+
+ // Offset these base pointers as needed given the current row, col.
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "add r5, r1, r8, lsl #2\n"
+
+ "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
+ "it ne\n"
+ "movne r1, r5\n"
+
+ // Load 4 bias values.
+ "vld1.32 {d24, d25}, [r1]\n"
+
+ // Now that we know what LHS and RHS data the next iteration of the
+ // main loop will need to load, we start loading the first 32 bytes of
+ // each of LHS and RHS, into d0 -- d3 (LHS) and d8 -- d11 (RHS), as we
+ // don't need those registers anymore in the rest of the work on the
+ // current block.
+ "vld1.8 {d0, d1, d2, d3}, [%[lhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ "vld1.8 {d8, d9, d10, d11}, [%[rhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ // Add to the bias values the product
+ // (depth * lhs_zero_point * rhs_zero_point),
+ // See the term NZ1Z2 in equation (7) in
+ // https://arxiv.org/pdf/1712.05877.pdf
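+ // Putting the whole zero-point correction together, the int32 value we
+ // want for destination entry (r, c) is (a sketch, following that paper's
+ // equation (7)):
+ //   acc[r][c] + bias[r] + depth * lhs_zp * rhs_zp
+ //             - lhs_zp * rhs_sums[c] - rhs_zp * lhs_sums[r]
+ // The bias and depth*zp*zp terms are added here; the two sums terms are
+ // subtracted below, guarded by the HAS_RHS_SUMS / HAS_LHS_SUMS flags.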
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
+ "vdup.32 q9, r3\n"
+ "vadd.i32 q12, q12, q9\n"
+
+ // Perform the bias-addition (per the above, we have just folded into
+ // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
+ "vadd.i32 q14, q14, q12\n"
+ "vadd.i32 q15, q15, q12\n"
+
+ // LHS/RHS zero points
+ // Has RHS sums
+ "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
+ "beq 401f\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ // Offset by current col * number of bytes per value
+ "add r3, r3, r4, lsl #2\n"
+ "vld1.32 { d12 }, [r3]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
+ "vdup.32 q10, r5\n" // create lhs_zero_point_vec
+ // Subtract rhs_sums * lhs_zero_point, per
+ // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
+ "vmls.i32 q14, q10, d12[0]\n"
+ "vmls.i32 q15, q10, d12[1]\n"
+ "401:\n"
+
+ // Has LHS sums
+ "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
+ "beq 402f\n"
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ // Offset by current row * number of bytes per value
+ "add r2, r2, r4, lsl #2\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
+
+ // Load 4 lhs_sums values.
+ "vld1.32 {d22, d23}, [r2]\n"
+ "vdup.32 d13, r5\n" // rhs_zero_point
+
+ // Compute lhs_sums * rhs_zero_point.
+ "vmul.i32 q11, q11, d13[1]\n"
+ // Subtract lhs_sums * rhs_zero_point, per
+ // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
+ "vsub.s32 q14, q14, q11\n"
+ "vsub.s32 q15, q15, q11\n"
+
+ // If the destination is int32, it means the user wants the raw
+ // accumulators, so there is no need for us to down-quantize the values.
+ "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
+
+ "402:\n"
+
+ // At this point we have computed the final int32 values. Now we
+ // start down-quantizing them to obtain the final 8bit values.
+
+ // As part of this down-quantization, our int32 values will be
+ // multiplied by a multiplier that has a fixed-point component and an
+ // exponent component.
+
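+ // As a sketch, the scaling applied below to each int32 accumulator x is:
+ //   x = rounding_right_shift(
+ //         saturating_rounding_doubling_high_mul(x << max(exponent, 0),
+ //                                               multiplier_fixedpoint),
+ //         -min(exponent, 0))
+ // i.e. a positive exponent becomes a left shift before the fixed-point
+ // multiply (vqrdmulh), and a negative exponent becomes a rounding right
+ // shift after it (vrshl with a negative shift amount).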
+ // Load the exponent part of the multiplier.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "add r5, r1, r4, lsl #2\n"
+ "it ne\n"
+ "movne r1, r5\n"
+
+ "vld1.32 {q10}, [r1]\n"
+
+ RUY_MAKE_ZERO(q8)
+ "vmax.s32 q12, q10, q8\n"
+
+ "vshl.s32 q14, q14, q12\n"
+ "vshl.s32 q15, q15, q12\n"
+
+ "vmin.s32 q12, q10, q8\n"
+
+ // Load fixed point part of the multiplier
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
+ // r6 has flags, r4 has row
+ "add r5, r1, r4, lsl #2\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
+ "it ne\n"
+ "movne r1, r5\n"
+ "vld1.32 {q10}, [r1]\n" // multiplier_fixedpoint
+
+ // Apply the fixed-point part of the multiplier.
+ "vqrdmulh.s32 q14, q14, q10\n"
+ "vqrdmulh.s32 q15, q15, q10\n"
+
+ // We have some rounding division-by-power-of-two to do. This should
+ // always use "round to nearest". We allow for some
+ // freedom in how ties are broken, to strike a good compromise of
+ // performance on given hardware vs. perfect agreement of results
+ // across hardware.
+ //
+ // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation-
+ // defined tie-breaks to help performance. On NEON, this means that we
+ // can just use the NEON rounding instructions, such as vrshl here
+ // (srshl on ARM64). They happen to break ties upward.
+ //
+ // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+ // break-ties-away-from-zero, as described in Appendix B of
+ // https://arxiv.org/pdf/1712.05877.pdf
+ // When we wrote that, we thought that it would be less biased
+ // than the NEON upward tie-breaks, and we had observed some
+ // improvement on some model. However, it is only less biased for
+ // data centered at zero, which was likely the case in that model,
+ // but is not always the case. If we wanted something more consistently
+ // unbiased then we should try breaking ties to nearest-even.
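+ // Example with hypothetical values, for a rounding right shift by 1:
+ // -3/2 = -1.5 rounds to -1 with ties-upward (what vrshl does) but to -2
+ // with ties-away-from-zero; the fixup below only changes the rounded
+ // result for negative values that land exactly on a tie.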
+#if !RUY_OPT_ENABLED(RUY_OPT_NATIVE_ROUNDING)
+ // Fix up values to be right-shifted, so that the (round to nearest,
+ // break ties upward) behavior of vrshl applied to these fixed-up
+ // values produces the same result as the desired (round to nearest,
+ // break ties away from zero) behavior on the original values.
+ "vand q8, q14, q12\n"
+ "vand q9, q15, q12\n"
+ "vshr.s32 q8, q8, #31\n"
+ "vshr.s32 q9, q9, #31\n"
+ "vqadd.s32 q14, q14, q8\n"
+ "vqadd.s34 q15, q15, q9\n"
+
+#endif
+ // At this point we have reduced the problem of correctly implementing
+ // rounding divide-by-power-of-two to what the VRSHL instruction can
+ // do.
+ "vrshl.s32 q14, q14, q12\n"
+ "vrshl.s32 q15, q15, q12\n"
+
+ "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
+
+ // Store uint8 values:
+ RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in q14.
+ "vqmovn.s32 d28, q14\n"
+ "vqmovn.s32 d29, q15\n"
+
+ // At this point, d12 -- d27, d30 and d31 aren't used anymore for the
+ // current block, so we can start clearing these accumulators for the
+ // next block (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the destination zero point into each of the 8 16-bit slots
+ // in a q register.
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.16 q13, r4\n" // dst_zero_point
+
+ // Add the destination zero point
+ "vadd.i16 q14, q14, q13\n"
+
+ // Cast-and-saturate from int16 to uint8
+ // Now all 8 1-byte values are in d30.
+ "vqmovun.s16 d30, q14\n"
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.8 d28, r2\n" // clamp_min
+ "vdup.8 d29, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.u8 d30, d30, d28\n"
+ // Apply the clamp_max bound
+ "vmin.u8 d30, d30, d29\n"
+
+ // Compute how much of the 4x2 block of destination 8bit values we
+ // have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x2, there are some 4x2 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x2 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ "cmp r2, #2\n"
+ // Compute r2 = how many cols of the 4x2 block fit
+ "it gt\n"
+ "movgt r2, r5\n"
+
+ // Test if r1==4 && r2 == 2, i.e. if all of the 4x2 block fits.
+ "cmp r1, r3\n"
+ "it eq\n"
+ "cmpeq r2, r5\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x2 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x2 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.8 {d30}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "mov r6, #0\n"
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ "ldrb r10, [r3, r8]\n"
+ "strb r10, [r4, r8]\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "add r6, r6, #1\n"
+ "add r3, r3, #4\n"
+ "add r4, r4, r5\n"
+ "cmp r6, r2\n"
+ "blt 50b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x2 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #1\n"
+
+ "vst1.32 {d30[0]}, [r3]\n"
+ "add r4, r4, r5\n"
+ "mov r3, r4\n"
+ "vst1.32 {d30[1]}, [r3]\n"
+
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #4\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q14)
+ RUY_MAKE_ZERO(q15)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ // Store int8 values:
+ RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in q14.
+ "vqmovn.s32 d28, q14\n"
+ "vqmovn.s32 d29, q15\n"
+
+ // At this point, d12 -- d27, d30 and d31 aren't used anymore for the
+ // current block, so we can start clearing these accumulators for the
+ // next block (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the destination zero point into each of the 8 16-bit slots
+ // in a q register.
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.16 q13, r4\n" // dst_zero_point
+
+ // Add the destination zero point
+ "vadd.i16 q14, q14, q13\n"
+
+ // Cast-and-saturate from int16 to int8
+ // Now all 8 1-byte values are in d30.
+ "vqmovn.s16 d30, q14\n"
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.8 d28, r2\n" // clamp_min
+ "vdup.8 d29, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.s8 d30, d30, d28\n"
+ // Apply the clamp_max bound
+ "vmin.s8 d30, d30, d29\n"
+
+ // Compute how much of the 4x2 block of destination 8bit values we
+ // have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x2, there are some 4x2 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x2 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ "cmp r2, #2\n"
+ // Compute r2 = how many cols of the 4x2 block fit
+ "it gt\n"
+ "movgt r2, r5\n"
+
+ // Test if r1==4 && r2 == 2, i.e. if all of the 4x2 block fits.
+ "cmp r1, r3\n"
+ "it eq\n"
+ "cmpeq r2, r5\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x2 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x2 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.8 {d30}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "mov r6, #0\n"
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ "ldrb r10, [r3, r8]\n"
+ "strb r10, [r4, r8]\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "add r6, r6, #1\n"
+ "add r3, r3, #4\n"
+ "add r4, r4, r5\n"
+ "cmp r6, r2\n"
+ "blt 50b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x2 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #1\n"
+
+ "vst1.32 {d30[0]}, [r3]\n"
+ "add r4, r4, r5\n"
+ "mov r3, r4\n"
+ "vst1.32 {d30[1]}, [r3]\n"
+
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #4\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q14)
+ RUY_MAKE_ZERO(q15)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
+
+ // Load the destination zero point into each of the 4 32-bit slots
+ // in a q register.
+ "ldrsh r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.32 q13, r4\n" // dst_zero_point
+ // Add the destination zero point
+ "vadd.s32 q14, q14, q13\n"
+ "vadd.s32 q15, q15, q13\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in q14.
+ "vqmovn.s32 d28, q14\n"
+ "vqmovn.s32 d29, q15\n"
+
+ // At this point, q6 -- q11 and q15 aren't used anymore for the current
+ // block, so we can start clearing these accumulators for the next block
+ // (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrh r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrh r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.16 q12, r2\n" // clamp_min
+ "vdup.16 q13, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.s16 q14, q14, q12\n"
+ // Apply the clamp_max bound
+ "vmin.s16 q14, q14, q13\n"
+
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+
+ // Compute how much of the 4x2 block of destination 16-bit values we
+ // have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x2, there are some 4x2 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x2 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ "cmp r2, #2\n"
+ // Compute r2 = how many cols of the 4x2 block fit
+ "it gt\n"
+ "movgt r2, r5\n"
+
+ // Test if r1==4 && r2 == 2, i.e. if all of the 4x2 block fits.
+ "cmp r1, r3\n"
+ "it eq\n"
+ "cmpeq r2, r5\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x2 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x2 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.16 {q14}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "mov r6, #0\n"
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ // Shift of offset register for half-word loads not allowed in A32,
+ // so we shift, load/store, then shift back r8.
+ "lsl r8, r8, #1\n"
+ "ldrh r10, [r3, r8]\n"
+ "strh r10, [r4, r8]\n"
+ "lsr r8, r8, #1\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "add r6, r6, #1\n"
+ "add r3, r3, #8\n"
+ "add r4, r4, r5\n"
+ "cmp r6, r2\n"
+ "blt 50b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x2 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #2\n"
+
+ "vst1.16 {d28[0]}, [r3], r6\n"
+ "add r4, r4, r5\n"
+ "vst1.16 {d28[1]}, [r3], r6\n"
+ "vst1.16 {d28[2]}, [r3], r6\n"
+ "vst1.16 {d28[3]}, [r3], r6\n"
+ "mov r3, r4\n"
+ "vst1.16 {d29[0]}, [r3], r6\n"
+ "vst1.16 {d29[1]}, [r3], r6\n"
+ "vst1.16 {d29[2]}, [r3], r6\n"
+ "vst1.16 {d29[3]}, [r3], r6\n"
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #8\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q14)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
+
+ // Since the store type is the same as the accum type, no need for
+ // downcast. There's also no need for clamp by min/max.
+
+ // At this point, q6 -- q13 aren't used anymore for the current block,
+ // so we can start clearing these accumulators for the next block
+ // (next iteration of the main loop).
+ // Clear accumulators.
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+
+ // Compute how much of the 4x2 block of destination 32-bit values we
+ // have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x2, there are some 4x2 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x2 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ "cmp r2, #2\n"
+ // Compute r2 = how many cols of the 4x2 block fit
+ "it gt\n"
+ "movgt r2, r5\n"
+
+ // Test if r1==4 && r2 == 2, i.e. if all of the 4x2 block fits.
+ "cmp r1, r3\n"
+ "it eq\n"
+ "cmpeq r2, r5\n"
+ // Yes, all of the 4x2 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x2 block fits.
+ // Set (r3 address, r4 stride) to write to dst_tmp_buf
+ "mov r3, %[dst_tmp_buf]\n"
+ "mov r4, #16\n"
+ "b 31f\n"
+
+ "30:\n"
+ // Yes, all of the 4x2 block fits.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // r3 address, r4 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r5\n"
+
+ "31:\n"
+
+ "vst1.32 {d28, d29}, [r3]\n"
+ "add r3, r3, r4\n"
+ "vst1.32 {d30, d31}, [r3]\n"
+
+ // If all of the 4x2 block fits, we just finished writing it to the
+ // destination, so we skip the next part.
+ "beq 41f\n"
+ // Not all of the 4x2 block fits in the destination matrix. We just
+ // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
+ // it to copy into the destination matrix the part that fits.
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "mov r3, %[dst_tmp_buf]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r6, #0\n"
+ "50:\n"
+ "mov r5, #0\n"
+ "51:\n"
+ "ldr r10, [r3, r5, lsl #2]\n"
+ "str r10, [r4, r5, lsl #2]\n"
+ "add r5, r5, #1\n"
+ "cmp r5, r1\n"
+ "blt 51b\n"
+ "add r6, r6, #1\n"
+ "add r3, r3, #16\n"
+ "add r4, r4, r8\n"
+ // r2 = how many cols of the 4x2 block fit
+ "cmp r6, r2\n"
+ "blt 50b\n"
+
+ "41:\n"
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #16\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
+
+ // Reload some params --- we had used r3, r5 and r6 for a few other
+ // things since the last time we had loaded them.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+
+ // Move to the next block of the destination matrix, for the next iter
+ // of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
+ // been updated earlier.
+ // Have we reached the end row?
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r8, r3\n"
+
+ "beq 20f\n" // yes, end row.
+ // Not end row. Move to the next row.
+ "add r8, r8, #4\n"
+ // Store new value of row
+ "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "b 21f\n"
+ "20:\n"
+ // Was already at end row.
+ // Move back to first row.
+ "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ // Move to the next column.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "add r4, r4, #2\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Increment dst_col_ptr by 2 * dst_stride (i.e. 2 columns)
+ "add r1, r1, r8, lsl #1\n"
+ // Store dst_col_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Store dst_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "21:\n"
+
+ // Main loop exit condition: have we hit the end column?
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n"
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial ld1 instructions
+ // above, this is currently 16.
+ "mov r1, #16\n"
+
+ "ble 1b\n"
+
+ // Restore stack pointer.
+ "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ // clang-format on
+
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
+ : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
+ : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
+ // Clobber list must specify q registers (and not their constituent
+ // d registers). There is a (currently unexplained) slowdown if
+ // d registers are listed in the clobbers list.
+ "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q12", "q13", "q14", "q15");
+}
+
+// Fast-int8 true "GEMV" kernel (RHS has 1 column). We assume the RHS
+// is still packed as if it has two columns.
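+// Because of that packing, the kernel below consumes 16 bytes of RHS per
+// 16 levels of depth and then skips the 16 bytes belonging to the absent
+// second column (the "add %[rhs_ptr], %[rhs_ptr], #16" right after each
+// RHS load).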
+void Kernel8bitNeonOutOfOrder1Col(const KernelParams8bit<4, 2>& params) {
+ profiler::ScopeLabel label(
+ "Kernel (kNeon, optimized for out-of-order cores)");
+
+ CheckOffsetsInKernelParams8bit(params);
+
+ const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
+ const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
+ const std::int8_t* lhs_ptr = lhs_col_ptr;
+ const std::int8_t* rhs_ptr = rhs_col_ptr;
+
+ // The asm kernel below has the following NEON register allocation:
+ //
+ // q6 - q13 are 128-bit (4x32b) accumulators.
+ // During accumulation, d0 -- d7 are used to load int8 data from LHS and
+ // d8 -- d11 from RHS:
+ // int8 RHS 16x1 block
+ // /------------\
+ // | d8.b[0] |
+ // | ... |
+ // | d8.b[7] |
+ // | d9.b[0] |
+ // | ... |
+ // | d9.b[7] |
+ // \------------/
+ // int8 LHS 4x16 block
+ // /-----------------------------------------\ /------------\
+ // |d0.b[0] ... d0.b[7] d1.b[0] ... d1.b[7] | | q6 |
+ // |d2.b[0] ... d2.b[7] d3.b[0] ... d3.b[7] | | q7 |
+ // |d4.b[0] ... d4.b[7] d5.b[0] ... d5.b[7] | | q8 |
+ // |d6.b[0] ... d6.b[7] d7.b[0] ... d7.b[7] | | q9 |
+ // \-----------------------------------------/ \------------/
+ // 128-bit accumulators 4x1 block
+ //
+ // No attempt has been made so far at implementing the RUY_OPT_MAX_STREAMING
+ // optimization for this kernel.
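+ // In scalar terms, each 16-deep pass of the accumulation loop below
+ // computes (a plain-C sketch, not part of this kernel):
+ //   for (int i = 0; i < 4; i++)
+ //     for (int k = 0; k < 16; k++)
+ //       acc[i] += (int32)lhs[i][k] * (int32)rhs[k];
+ // with acc[i] spread across the lanes of q6 -- q9 and reduced to a single
+ // 32-bit value per row by the vpadd.i32 sequence after the loop.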
+ asm volatile(
+#define RUY_MAKE_ZERO(reg) "vmov.i32 " #reg ", #0x00000000\n"
+
+ // clang-format off
+
+ // Load the first 64 bytes of LHS and RHS data.
+ "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
+ // Skip the other column and advance the pointer.
+ "add %[rhs_ptr], %[rhs_ptr], #16\n"
+
+ "sub sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_BASE_PTR) "]\n"
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_START_COL) "]\n"
+ "str r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_RHS_BASE_PTR) "]\n"
+ "str r2, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+
+ // Clear accumulators.
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q14)
+ RUY_MAKE_ZERO(q15)
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial vld1 instructions
+ // above, this is currently 16.
+ "mov r1, #16\n"
+
+ // Main loop of the whole GEMM, over rows and columns of the
+ // destination matrix.
+ "1:\n"
+
+ // r1 is how many levels of depth we have already loaded
+ // data for, r10 is the total depth.
+ "ldr r10, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
+ "cmp r1, r10\n"
+ "beq 79f\n"
+
+ "2:\n"
+
+ // Mult, mult-acc into q14, q15
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q15, d2, d8\n"
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q15, d3, d9\n"
+
+ // Then pairwise accumulate into q6, q7
+ "vpadal.s16 q6, q14\n"
+ "vpadal.s16 q7, q15\n"
+
+ // Mult, mult-acc into q14, q15
+ "vmull.s8 q14, d4, d8\n"
+ "vmull.s8 q15, d6, d8\n"
+ "vmlal.s8 q14, d5, d9\n"
+ "vmlal.s8 q15, d7, d9\n"
+
+ // Then pairwise accumulate into q8, q9
+ "vpadal.s16 q8, q14\n"
+ "vpadal.s16 q9, q15\n"
+
+
+ // Load the next 64 bytes of LHS and 16 bytes of RHS data.
+ "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
+ // Skip the other column and advance the pointer.
+ "add %[rhs_ptr], %[rhs_ptr], #16\n"
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ // Each iteration of this loop advances by 16 levels of depth.
+ "add r1, r1, #16\n"
+
+ // Loop termination condition
+ "cmp r1, r10\n"
+
+ "blt 2b\n"
+
+ "79:\n"
+
+ // Mult, mult-acc into q14, q15
+ "vmull.s8 q14, d0, d8\n"
+ "vmull.s8 q15, d2, d8\n"
+ "vmlal.s8 q14, d1, d9\n"
+ "vmlal.s8 q15, d3, d9\n"
+
+ // Then pairwise accumulate into q6, q7
+ "vpadal.s16 q6, q14\n"
+ "vpadal.s16 q7, q15\n"
+
+ // Mult, mult-acc into q14, q15
+ "vmull.s8 q14, d4, d8\n"
+ "vmull.s8 q15, d6, d8\n"
+ "vmlal.s8 q14, d5, d9\n"
+ "vmlal.s8 q15, d7, d9\n"
+
+ // Then pairwise accumulate into q8, q9
+ "vpadal.s16 q8, q14\n"
+ "vpadal.s16 q9, q15\n"
+
+ // All accumulation over depth done. q6 - q9 contain the 4x32b
+ // accumulators for the 4x1 final matrix.
+ // We now have to compute the final 8-bit values from these int32
+ // accumulators, and advance to the next 4x1 block. We intertwine
+ // these two aspects whenever possible for optimal pipelining, both
+ // at the data flow level (prefetch data for next block as early as
+ // possible) and instruction pipelining level (some of the next-block
+ // work can dual-issue with some of the final work on the current
+ // block).
+
+ // q6-q9 each now contain 4 x 32b partial sums; pairwise-add adjacent
+ // lanes to reduce each row's accumulator to 2 x 32b values.
+ "vpadd.i32 d0, d12, d13\n"
+ "vpadd.i32 d1, d14, d15\n"
+ "vpadd.i32 d2, d16, d17\n"
+ "vpadd.i32 d3, d18, d19\n"
+
+ // d0-d3 each contain 2 x 32b partial sums. One more pairwise add
+ // yields a single 32b value for each of the 4x1 destination entries,
+ // gathered into two 'd' registers (d28, d29).
+ "vpadd.i32 d28, d0, d1\n"
+ "vpadd.i32 d29, d2, d3\n"
+
+ // Now d28,d29 have the 1 x 32b accumulators for the 4x1 entries.
+
+ // Logic to advance to the next block in preparation for the next
+ // iteration of the main loop. For now, we only want to compute
+ // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
+ // not yet ready to update the values of row and col, as we still need
+ // the current values for the rest of the work on the current block.
+
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r1, r3\n" // Have we finished the last row?
+
+ "bge 4f\n" // If finished last row, go to 4
+ // Not finished last row: then advance to next row.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "add r4, r4, r1, lsl #2\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "b 5f\n"
+ "4:\n" // Finished last row...
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ // Go back to first row
+ "str r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+
+ // Now we need to advance to the next column. If we already
+ // finished the last column, then in principle we are done, however
+ // we can't just return here, as we need to allow the end work of the
+ // current block to complete. The good news is that at this point it
+ // doesn't matter what data we load for the next column, since
+ // we will exit from the main loop below before actually storing
+ // anything computed from that data.
+
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n" // Have we finished the last column?
+ "bge 5f\n" // If yes, just carry on without updating the column pointer.
+ // Not finished last column: then advance to next column.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
+ "ldr r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "add r10, r10, r1, lsl #1\n"
+ "str r10, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "5:\n"
+
+ // Set the LHS and RHS data pointers to the start of the columns just
+ // computed.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_LHS_COL_PTR) "]\n"
+ "mov %[lhs_ptr], r4\n"
+ "ldr r5, [sp, #" RUY_STR(RUY_STACK_OFFSET_RHS_COL_PTR) "]\n"
+ "mov %[rhs_ptr], r5\n"
+
+ // Now we load: bias data, LHS sums data, RHS sums data.
+
+ // First, load the base pointers from the params.
+ "ldrb r4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
+
+ // Offset these base pointers as needed given the current row, col.
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "add r5, r1, r8, lsl #2\n"
+
+ "tst r4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
+ "it ne\n"
+ "movne r1, r5\n"
+
+ // Load 4 bias values.
+ "vld1.32 {d24, d25}, [r1]\n"
+
+ // Now that we know what LHS and RHS data the next iteration of the
+ // main loop will need to load, we start loading the first 64 bytes of
+ // LHS and 16 bytes of RHS into d0 -- d9, as we don't need those
+ // registers anymore in the rest of the work on the current block.
+ "vld1.8 {d0, d1}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d2, d3}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d4, d5}, [%[lhs_ptr]]!\n"
+ "vld1.8 {d6, d7}, [%[lhs_ptr]]!\n"
+ RUY_PREFETCH_LOAD("pld [%[lhs_ptr]]\n")
+ "vld1.8 {d8, d9}, [%[rhs_ptr]]!\n"
+ // Skip the other column and advance the pointer.
+ "add %[rhs_ptr], %[rhs_ptr], #16\n"
+ RUY_PREFETCH_LOAD("pld [%[rhs_ptr]]\n")
+
+ // Add to the bias values the product
+ // (depth * lhs_zero_point * rhs_zero_point),
+ // See the term NZ1Z2 in equation (7) in
+ // https://arxiv.org/pdf/1712.05877.pdf
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
+ "vdup.32 q9, r3\n"
+ "vadd.i32 q12, q12, q9\n"
+
+ // Perform the bias-addition (per the above, we have just folded into
+ // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
+ "vadd.i32 q14, q14, q12\n"
+
+ // LHS/RHS zero points
+ // Has RHS sums
+ "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
+ "beq 401f\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ // Offset by current col * number of bytes per value
+ "add r3, r3, r4, lsl #2\n"
+ "vld1.32 { d12 }, [r3]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
+ "vdup.32 q10, r5\n" // create lhs_zero_point_vec
+ // Subtract rhs_sums * lhs_zero_point, per
+ // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
+ "vmls.i32 q14, q10, d12[0]\n"
+ "401:\n"
+
+ // Has LHS sums
+ "ldrb r6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
+ "beq 402f\n"
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ // Offset by current row * number of bytes per value
+ "add r2, r2, r4, lsl #2\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
+
+ // Load 4 lhs_sums values.
+ "vld1.32 {d22, d23}, [r2]\n"
+ "vdup.32 d13, r5\n" // rhs_zero_point
+
+ // Compute lhs_sums * rhs_zero_point.
+ "vmul.i32 q11, q11, d13[1]\n"
+ // Subtract lhs_sums * rhs_zero_point, per
+ // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
+ "vsub.s32 q14, q14, q11\n"
+
+ // If the destination is int32, it means the user asks for the raw
+ // accumulators; no need for us to down-quantize the values.
+ "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
+
+ "402:\n"
+
+ // At this point we have computed the final int32 values. Now we
+ // start down-quantizing them to obtain the final 8-bit values.
+
+ // As part of this down-quantization, our int32 values will be
+ // multiplied by a multiplier that has a fixed-point component and an
+ // exponent component.
+
+ // Load the exponent part of the multiplier.
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "add r5, r1, r4, lsl #2\n"
+ "it ne\n"
+ "movne r1, r5\n"
+
+ "vld1.32 {q10}, [r1]\n"
+
+ RUY_MAKE_ZERO(q8)
+ "vmax.s32 q12, q10, q8\n"
+
+ "vshl.s32 q14, q14, q12\n"
+
+ "vmin.s32 q12, q10, q8\n"
+
+ // Load fixed point part of the multiplier
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
+ // r6 has flags, r4 has row
+ "add r5, r1, r4, lsl #2\n"
+ "tst r6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
+ "it ne\n"
+ "movne r1, r5\n"
+ "vld1.32 {q10}, [r1]\n" // multiplier_fixedpoint
+
+ // Apply the fixed-point part of the multiplier.
+ "vqrdmulh.s32 q14, q14, q10\n"
+
+ // We have some rounding division-by-power-of-two to do. This should
+ // always use "round to nearest". We allow for some
+ // freedom in how ties are broken, to strike a good compromise of
+ // performance on given hardware vs. perfect agreement of results
+ // across hardware.
+ //
+ // When RUY_OPT_NATIVE_ROUNDING is enabled, we allow for implementation
+ // defined tie-breaks to help performance. On NEON, this means that we
+ // can just use the NEON rounding instructions, such as vrshl. They
+ // happen to break ties upward.
+ //
+ // When RUY_OPT_NATIVE_ROUNDING is disabled, we implement strict
+ // break-ties-away-from-zero, as described in Appendix B of
+ // https://arxiv.org/pdf/1712.05877.pdf
+ // When we wrote that, we thought that it would be less biased than the
+ // NEON upward tie-breaks, and we had observed some improvement on one
+ // model. However, it is only less biased for data centered at zero,
+ // which was likely the case in that model, but is not always the case.
+ // If we wanted something more consistently unbiased, we should try
+ // breaking ties to nearest-even.
+#if !RUY_OPT_ENABLED(RUY_OPT_NATIVE_ROUNDING)
+ // Fix up values to be right-shifted, so that the (round to nearest,
+ // break ties upward) behavior of vrshl applied to these fixed-up
+ // values produces the same result as the desired (round to nearest,
+ // break ties away from zero) behavior on the original values.
+ "vand q8, q14, q12\n"
+ "vshr.s32 q8, q8, #31\n"
+ "vqadd.s32 q14, q14, q8\n"
+
+#endif
+ // At this point we have reduced the problem of correctly implementing
+ // rounding divide-by-power-of-two, to what the VRSHL instruction can
+ // do.
+ "vrshl.s32 q14, q14, q12\n"
+
+ "ldrb r10, [%[params], #" RUY_STR(RUY_OFFSET_DST_TYPE_ID) "]\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
+ "cmp r10, #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
+ "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
+
+ // Store uint8 values:
+ RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in d28.
+ "vqmovn.s32 d28, q14\n"
+
+ // At this point, q6 -- q13 and q15 aren't used anymore for the
+ // current block, so we can start clearing these accumulators for the
+ // next block (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the destination zero point into each of the 8 16-bit slots
+ // in a q register.
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.16 q13, r4\n" // dst_zero_point
+
+ // Add the destination zero point
+ "vadd.i16 q14, q14, q13\n"
+
+ // Cast-and-saturate from int16 to uint8
+ "vqmovun.s16 d30, q14\n"
+ // At this point, we only need 4 8-bit values in the lower half
+ // of d30.
+
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.8 d28, r2\n" // clamp_min
+ "vdup.8 d29, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.u8 d30, d30, d28\n"
+ // Apply the clamp_max bound
+ "vmin.u8 d30, d30, d29\n"
+
+ // Compute how much of the 4x1 block of destination 8-bit values that
+ // we have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x1, there are some 4x1 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x1 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ // Test if r1==4, i.e. if all of the 4x1 block fits.
+ "cmp r1, r3\n"
+
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x1 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x1 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.8 {d30}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ "ldrb r10, [r3, r8]\n"
+ "strb r10, [r4, r8]\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x1 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #1\n"
+
+ "vst1.8 {d30[0]}, [r3], r6\n"
+ "vst1.8 {d30[1]}, [r3], r6\n"
+ "vst1.8 {d30[2]}, [r3], r6\n"
+ "vst1.8 {d30[3]}, [r3], r6\n"
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #4\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q14)
+ RUY_MAKE_ZERO(q15)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ // Store int8 values:
+ RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in d28.
+ "vqmovn.s32 d28, q14\n"
+
+ // At this point, q6 -- q13 and q15 aren't used anymore for the
+ // current block, so we can start clearing these accumulators for the
+ // next block (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the destination zero point into each of the 8 16-bit slots
+ // in a q register.
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.16 q13, r4\n" // dst_zero_point
+
+ // Add the destination zero point
+ "vadd.i16 q14, q14, q13\n"
+
+ // Cast-and-saturate from int16 to int8
+ "vqmovn.s16 d30, q14\n"
+ // At this point, we only need 4 8-bit values in the lower half
+ // of d30.
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrb r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrb r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.8 d28, r2\n" // clamp_min
+ "vdup.8 d29, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.s8 d30, d30, d28\n"
+ // Apply the clamp_max bound
+ "vmin.s8 d30, d30, d29\n"
+
+ // Compute how much of the 4x1 block of destination 8-bit values that
+ // we have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x1, there are some 4x1 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x1 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ // Test if r1==4, i.e. if all of the 4x1 block fits.
+ "cmp r1, r3\n"
+
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x1 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x1 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.8 {d30}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ "ldrb r10, [r3, r8]\n"
+ "strb r10, [r4, r8]\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x1 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #1\n"
+
+ "vst1.8 {d30[0]}, [r3], r6\n"
+ "vst1.8 {d30[1]}, [r3], r6\n"
+ "vst1.8 {d30[2]}, [r3], r6\n"
+ "vst1.8 {d30[3]}, [r3], r6\n"
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #4\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q13)
+ RUY_MAKE_ZERO(q14)
+ RUY_MAKE_ZERO(q15)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
+
+ // Load the destination zero point into each of the 4 32-bit slots
+ // in a q register.
+ "ldrsh r4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
+ "vdup.32 q13, r4\n" // dst_zero_point
+ // Add the destination zero point
+ "vadd.s32 q14, q14, q13\n"
+ //"vadd.s32 q15, q15, q13\n"
+
+ // Cast-and-saturate from int32 to int16
+ // After this, all values for output are in d28.
+ "vqmovn.s32 d28, q14\n"
+
+ // At this point, q6 -- q11 and q15 aren't used anymore for the
+ // current block, so we can start clearing these accumulators for the
+ // next block (next iteration of the main loop).
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q15)
+
+ // Load the clamp_min, clamp_max bounds
+ "ldrh r2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
+ "ldrh r3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
+ "vdup.16 d24, r2\n" // clamp_min
+ "vdup.16 d26, r3\n" // clamp_max
+
+ // Apply the clamp_min bound
+ "vmax.s16 d28, d28, d24\n"
+ // Apply the clamp_max bound
+ "vmin.s16 d28, d28, d26\n"
+
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+
+ // Compute how much of the 4x1 block of destination 16-bit values that
+ // we have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x1, there are some 4x1 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x1 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ // Test if r1==4, i.e. if all of the 4x1 block fits.
+ "cmp r1, r3\n"
+
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // Yes, all of the 4x1 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x1 block fits.
+ // Store to dst_tmp_buf
+ // Set r3 address to write to dst_tmp_buf.
+ "mov r3, %[dst_tmp_buf]\n"
+ "vst1.16 {d28}, [r3]\n"
+
+ // Slow loop copying from dst_tmp_buf to dst.
+ "50:\n"
+ "mov r8, #0\n"
+ "51:\n"
+ // Shift of offset register for half-word loads not allowed in A32,
+ // so we shift, load/store, then shift back r8.
+ "lsl r8, r8, #1\n"
+ "ldrh r10, [r3, r8]\n"
+ "strh r10, [r4, r8]\n"
+ "lsr r8, r8, #1\n"
+ "add r8, r8, #1\n"
+ "cmp r8, r1\n"
+ "blt 51b\n"
+ "b 31f\n"
+ "30:\n"
+ // Yes, all of the 4x1 block fits.
+ // r3 address, r5 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r3\n"
+ "mov r6, #2\n"
+
+ "vst1.16 {d28[0]}, [r3], r6\n"
+ "vst1.16 {d28[1]}, [r3], r6\n"
+ "vst1.16 {d28[2]}, [r3], r6\n"
+ "vst1.16 {d28[3]}, [r3], r6\n"
+ "31:\n"
+
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #8\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q14)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
+
+ // Since the store type is the same as the accum type, there is no need
+ // for a downcast. There is also no need to clamp by min/max.
+
+ // At this point, q6 -- q13 aren't used anymore for the current block,
+ // so we can start clearing these accumulators for the next block
+ // (next iteration of the main loop).
+ // Clear accumulators.
+ RUY_MAKE_ZERO(q6)
+ RUY_MAKE_ZERO(q7)
+ RUY_MAKE_ZERO(q8)
+ RUY_MAKE_ZERO(q9)
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+ RUY_MAKE_ZERO(q12)
+ RUY_MAKE_ZERO(q13)
+
+ // Compute how much of the 4x1 block of destination 32-bit values that
+ // we have computed fits in the destination matrix. Typically, all of
+ // it fits, but when the destination matrix shape is not a multiple
+ // of 4x1, there are some 4x1 blocks along the boundaries that do
+ // not fit entirely.
+
+ "ldr r1, [%[params], #" RUY_STR(RUY_OFFSET_DST_ROWS) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "sub r1, r1, r8\n"
+
+ "ldr r2, [%[params], #" RUY_STR(RUY_OFFSET_DST_COLS) "]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "sub r2, r2, r4\n"
+ "mov r3, #4\n"
+ "mov r5, #2\n"
+ "cmp r1, #4\n"
+ // Compute r1 = how many rows of the 4x1 block fit
+ "it gt\n"
+ "movgt r1, r3\n"
+
+ // Test if r1==4, i.e. if all of the 4x1 block fits.
+ "cmp r1, r3\n"
+
+ // Yes, all of the 4x1 block fits, go to fast path.
+ "beq 30f\n"
+ // Not all of the 4x1 block fits.
+ // Set (r3 address, r4 stride) to write to dst_tmp_buf
+ "mov r3, %[dst_tmp_buf]\n"
+ "mov r4, #16\n"
+ "b 31f\n"
+
+ "30:\n"
+ // Yes, all of the 4x1 block fits.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ // r3 address, r4 stride
+ "ldr r3, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "mov r4, r5\n"
+
+ "31:\n"
+
+ "vst1.32 {d28, d29}, [r3]\n"
+
+ // If all of the 4x1 block fits, we just finished writing it to the
+ // destination, so we skip the next part.
+ "beq 41f\n"
+ // Not all of the 4x1 block fits in the destination matrix. We just
+ // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
+ // it to copy into the destination matrix the part that fits.
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "mov r3, %[dst_tmp_buf]\n"
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "50:\n"
+ "mov r5, #0\n"
+ "51:\n"
+ "ldr r10, [r3, r5, lsl #2]\n"
+ "str r10, [r4, r5, lsl #2]\n"
+ "add r5, r5, #1\n"
+ "cmp r5, r1\n"
+ "blt 51b\n"
+
+ "41:\n"
+ // Load dst_ptr, increment, and write back.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "add r4, r4, #16\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+
+ RUY_MAKE_ZERO(q10)
+ RUY_MAKE_ZERO(q11)
+
+ "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
+
+ RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
+
+ // Reload some params --- we had used r3, r5 and r6 for a few other
+ // things since the last time we had loaded them.
+ "ldr r5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
+ "ldr r6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
+ "ldr r3, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
+
+ // Move to the next block of the destination matrix, for the next iter
+ // of the main loop. Notice that lhs_col_ptr, rhs_col_ptr have already
+ // been updated earlier.
+ // Have we reached the end row?
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ "cmp r8, r3\n"
+
+ "beq 20f\n" // yes, end row.
+ // Not end row. Move to the next row.
+ "add r8, r8, #4\n"
+ // Store new value of row
+ "str r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+
+ "b 21f\n"
+ "20:\n"
+ // Was already at end row.
+ // Move back to first row.
+ "str r6, [sp, #" RUY_STR(RUY_STACK_OFFSET_ROW) "]\n"
+ // Move to the next column.
+ "ldr r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "add r4, r4, #2\n"
+ "str r4, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+
+ "ldr r8, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
+ "ldr r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Increment dst_col_ptr by dst_stride (i.e. 1 column)
+ "add r1, r1, r8\n"
+ // Store dst_col_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_COL_PTR) "]\n"
+ // Store dst_ptr
+ "str r1, [sp, #" RUY_STR(RUY_STACK_OFFSET_DST_PTR) "]\n"
+ "21:\n"
+
+ // Main loop exit condition: have we hit the end column?
+ "ldr r4, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
+ "ldr r8, [sp, #" RUY_STR(RUY_STACK_OFFSET_COL) "]\n"
+ "cmp r8, r4\n"
+
+ // r1 is the number of levels of depth that we have already loaded
+ // LHS and RHS data for. Corresponding to the initial vld1 instructions
+ // above, this is currently 16.
+ "mov r1, #16\n"
+
+ "ble 1b\n"
+
+ // Restore stack pointer.
+ "add sp, sp, #" RUY_STR(RUY_STACK_OFFSET_SIZE) "\n"
+
+ // clang-format on
+
+ : [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr)
+ : [ params ] "r"(&params), [dst_tmp_buf] "r"(params.dst_tmp_buf)
+ : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r8", "r10", "cc",
+ // Clobber list must specify q registers (and not their constituent
+ // d registers). There is a (currently unexplained) slowdown if
+ // d registers are listed in the clobbers list.
+ "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+ "q9", "q10", "q12", "q13", "q14", "q15");
+}
+
+#undef RUY_OFFSET_BIAS
+#undef RUY_OFFSET_LHS_SUMS
+#undef RUY_OFFSET_RHS_SUMS
+#undef RUY_OFFSET_LHS_BASE_PTR
+#undef RUY_OFFSET_MULTIPLIER_FIXEDPOINT
+#undef RUY_OFFSET_MULTIPLIER_EXPONENT
+#undef RUY_OFFSET_RHS_BASE_PTR
+#undef RUY_OFFSET_DST_BASE_PTR
+#undef RUY_OFFSET_LHS_ZERO_POINT
+#undef RUY_OFFSET_RHS_ZERO_POINT
+#undef RUY_OFFSET_DST_ZERO_POINT
+#undef RUY_OFFSET_PROD_ZP_DEPTH
+#undef RUY_OFFSET_START_ROW
+#undef RUY_OFFSET_START_COL
+#undef RUY_OFFSET_LAST_ROW
+#undef RUY_OFFSET_LAST_COL
+#undef RUY_OFFSET_DST_ROWS
+#undef RUY_OFFSET_DST_COLS
+#undef RUY_OFFSET_LHS_STRIDE
+#undef RUY_OFFSET_RHS_STRIDE
+#undef RUY_OFFSET_DST_STRIDE
+#undef RUY_OFFSET_DEPTH
+#undef RUY_OFFSET_CLAMP_MIN
+#undef RUY_OFFSET_CLAMP_MAX
+#undef RUY_OFFSET_FLAGS
+#undef RUY_OFFSET_DST_TYPE_ID
+
+#undef RUY_STACK_OFFSET_SIZE
+#undef RUY_STACK_OFFSET_DST_COL_PTR
+#undef RUY_STACK_OFFSET_DST_PTR
+#undef RUY_STACK_OFFSET_ROW
+#undef RUY_STACK_OFFSET_COL
+#undef RUY_STACK_OFFSET_LHS_COL_PTR
+#undef RUY_STACK_OFFSET_RHS_COL_PTR
+
+#endif // RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
+} // namespace ruy