x86: Remove redundant labels and undefs in SSSE3 itx asm

author: Henrik Gramner <gramner@twoorioles.com> 2022-02-28 02:41:15 +0300
committer: Henrik Gramner <henrik@gramner.com> 2022-02-28 02:43:30 +0300
commit: cf00849987f052cac3b3e147b8909a7a1348b527 (patch)
tree: b70020539efc95727537593e0a84538c52a00743
parent: 9124c54b346421231c3b1ea716dc928d678eb558 (diff)
1 files changed, 126 insertions, 160 deletions
diff --git a/src/x86/itx_sse.asm b/src/x86/itx_sse.asm
index bad443f..2bf3821 100644
--- a/src/x86/itx_sse.asm
+++ b/src/x86/itx_sse.asm
@@ -2380,7 +2380,7 @@ INV_TXFM_8X16_FN identity, identity
 cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*1, 32, 1
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     mova   [rsp+gprsize+16*1], m6
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
@@ -2392,7 +2392,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass2:
-    lea                  tx2q, [o(m(iidentity_8x16_internal_8bpc).end1)]
+    lea                  tx2q, [o(.end1)]
 
 .end:
     mova   [rsp+gprsize+16*0], m7
@@ -2448,7 +2448,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    coeffq+16*1, 32, 1
     call  .main
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2459,7 +2459,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                  tx2q, [o(m(idct_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(idct_8x8_internal_8bpc).pass2_main
 
@@ -2587,7 +2587,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call .main
     call .main_pass1_end
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2598,7 +2598,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                  tx2q, [o(m(iadst_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp m(iadst_8x8_internal_8bpc).pass2_main
 
@@ -2872,7 +2872,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
@@ -2883,7 +2883,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
-    lea                   tx2q, [o(m(iflipadst_16x8_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x8_internal_8bpc).pass2_main
 
@@ -2906,7 +2906,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova                   m6, [coeffq-16*3]
     mova                   m7, [coeffq-16*1]
     mov                    r3, tx2q
-    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).pass1_end)]
+    lea                  tx2q, [o(.pass1_end)]
 
 .pass1:
     mova                   m0, [o(pw_2896x8)]
@@ -2964,7 +2964,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp .pass1
 
 .pass2:
-    lea                  tx2q, [o(m(iidentity_16x8_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(iidentity_8x8_internal_8bpc).end
 
@@ -3002,7 +3002,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*3, 64
     call m(idct_16x8_internal_8bpc).main
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3010,7 +3010,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3021,7 +3021,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*2, 64
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_8192)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
@@ -3034,13 +3034,13 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp  m(idct_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     mov                   dstq, r3
     lea                     r3, [dstq+8]
     jmp   m(idct_8x8_internal_8bpc).end
@@ -3128,7 +3128,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3136,7 +3136,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3146,7 +3146,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_8192)]
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
@@ -3159,13 +3159,13 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(iadst_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iadst_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     mov                   dstq, r3
     lea                     r3, [dstq+8]
     jmp  m(iadst_8x8_internal_8bpc).end
@@ -3203,7 +3203,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(iadst_16x8_internal_8bpc).main_pass1_end
 
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3211,7 +3211,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*1, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3225,7 +3225,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*0, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     mova                    m7, [o(pw_m8192)]
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
@@ -3238,14 +3238,14 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
 
 .end:
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     lea                   dstq, [dstq+strideq*2]
     jmp  m(iflipadst_8x8_internal_8bpc).end
 
@@ -3268,7 +3268,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova    [rsp+gprsize+16*5], m6
     mova    [rsp+gprsize+16*6], m7
 
-    lea                   tx2q, [o(m(iflipadst_16x16_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     mov                   dstq, r3
     jmp m(iflipadst_8x16_internal_8bpc).pass2_main
 
@@ -3292,7 +3292,7 @@ INV_TXFM_16X16_FN identity, identity
 cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     add                 coeffq, 16*17
     mov                     r3, tx2q
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
 
 .pass1:
     mova                    m6, [o(pw_1697x16)]
@@ -3313,13 +3313,13 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass1_end:
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 16
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp .pass1
 
 .pass1_end1:
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 15*16
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp .pass1
 
 .pass1_end2:
@@ -3330,7 +3330,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass2:
     lea                     r3, [dstq+8]
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
 
 .end:
     mova    [rsp+gprsize+16*0], m7
@@ -3353,7 +3353,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .end1:
     LOAD_8ROWS     coeffq+16*1, 32
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     lea                   dstq, [dstq+strideq*2]
     jmp .end
 
@@ -3363,7 +3363,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
     add                 coeffq, 32*8
     LOAD_8ROWS          coeffq, 32
-    lea                   tx2q, [o(m(iidentity_16x16_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     mov                   dstq, r3
     jmp .end
 
@@ -3395,7 +3395,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
     pshuflw              m0, m0, q0000
     punpcklwd            m0, m0
     mov                 r3d, 8
-    lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x32_8bpc).end)]
+    lea                tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
 
 .end:
@@ -3404,14 +3404,13 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
 
 
 cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
     cmp                   eobd, 106
     jle .fast
 
     LOAD_8ROWS     coeffq+16*3, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1)]
+    lea                   tx2q, [o(.pass1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1:
@@ -3426,7 +3425,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*2, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_1)]
+    lea                   tx2q, [o(.pass1_1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_1:
@@ -3443,7 +3442,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*1, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -3458,7 +3457,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -3506,11 +3505,11 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call .main
 
 .pass2:
-    lea                     r3, [o(m(idct_8x32_internal_8bpc).end6)]
+    lea                     r3, [o(.end6)]
 
 .end:
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
 
 .end1:
     pxor                    m7, m7
@@ -3522,21 +3521,21 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                   tx2q
 
 .end2:
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end3:
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end4)]
+    lea                   tx2q, [o(.end4)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end4:
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
-    lea                   tx2q, [o(m(idct_8x32_internal_8bpc).end5)]
+    lea                   tx2q, [o(.end5)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end5:
@@ -3875,7 +3874,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r3d, 8
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x8_8bpc).end)]
+    lea                   tx2q, [o(.end)]
 
 .body:
     pmulhrsw                m0, m2
@@ -3911,7 +3910,6 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob,
 
 
 cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
     LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal_8bpc).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
@@ -3950,55 +3948,55 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass2:
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp  m(idct_8x32_internal_8bpc).end1
 
 .end:
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end1)]
+    lea                   tx2q, [o(.end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end1:
     lea                     r3, [dstq+8]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end2)]
+    lea                   tx2q, [o(.end2)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end2:
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end3)]
+    lea                   tx2q, [o(.end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end3:
     mov                   dstq, r3
     add                     r3, 8
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end4)]
+    lea                   tx2q, [o(.end4)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end4:
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end5)]
+    lea                   tx2q, [o(.end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end5:
     mov                   dstq, r3
     add                     r3, 8
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end6)]
+    lea                   tx2q, [o(.end6)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end6:
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end7)]
+    lea                   tx2q, [o(.end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end7:
     mov                   dstq, r3
-    lea                   tx2q, [o(m(idct_32x8_internal_8bpc).end8)]
+    lea                   tx2q, [o(.end8)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end8:
@@ -4077,6 +4075,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
     test                  eobd, eobd
     jz .dconly
     call  m(idct_16x32_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -4086,28 +4085,24 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
     mov               [coeffq], eobd
     pmulhrsw                m0, m1
     mov                    r2d, 16
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x32_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 
-.end:
-    RET
 
 cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     LOAD_8ROWS     coeffq+16*1, 128, 1
     call  m(idct_8x8_internal_8bpc).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*5, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
     SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
@@ -4124,14 +4119,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*4, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
@@ -4174,14 +4169,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*6, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
@@ -4199,14 +4194,14 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_7ROWS    rsp+gprsize+16*3, 16
     LOAD_8ROWS     coeffq+16*7, 128, 1
     call m(idct_16x8_internal_8bpc).main
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
     SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_16x32_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
@@ -4238,7 +4233,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov  [rsp+gprsize*1+16*35], eobd
     lea                     r3, [dstq+8]
     mov  [rsp+gprsize*2+16*35], r3
-    lea                     r3, [o(m(idct_16x32_internal_8bpc).end)]
+    lea                     r3, [o(.end)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .end:
@@ -4288,7 +4283,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS   rsp+gprsize+16*11, 16
 
     call m(idct_8x32_internal_8bpc).main_fast
-    jmp  .end1
+    jmp m(idct_8x32_internal_8bpc).pass2
 
 .full1:
     mova                    m4, [coeffq+16*2 ]            ;in16
@@ -4329,12 +4324,9 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mova   [rsp+gprsize+16*34], m7                        ;in31
 
     call m(idct_8x32_internal_8bpc).main
-
-.end1:
     jmp m(idct_8x32_internal_8bpc).pass2
 
 
-
 cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob, tx2
 %if ARCH_X86_32
     LEA                     r5, $$
@@ -4382,10 +4374,8 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
 
 
 cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     add                 coeffq, 16
-    lea                     r3, [o(m(idct_32x16_internal_8bpc).pass1_end1)]
+    lea                     r3, [o(.pass1_end1)]
 .pass1:
     LOAD_8ROWS     coeffq+16*0, 128, 1
     call  m(idct_8x8_internal_8bpc).main
@@ -4426,28 +4416,28 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     SAVE_8ROWS     coeffq+16*0, 32
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+16*32, 32
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
-    lea                   tx2q, [o(m(idct_32x16_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS    coeffq+16*48, 32
 
     sub                 coeffq, 16
-    lea                     r3, [o(m(idct_32x16_internal_8bpc).end)]
+    lea                     r3, [o(.end)]
     jmp .pass1
 
 .end:
@@ -4455,8 +4445,6 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 
 cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, eobd
     cmp                   eobd, 43                ;if (eob > 43)
     sbb                    r3d, r3d               ;  iteration_count++
@@ -4520,8 +4508,6 @@ cglobal inv_txfm_add_identity_identity_16x32_8bpc, 4, 6, 8, 16*4, dst, stride, c
 
 
 cglobal inv_txfm_add_identity_identity_32x16_8bpc, 4, 6, 8, 16*4, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 12                ;0100b
     mov                    r5d, 136               ;1000 1000b
     cmp                   eobd, 44                ;if (eob > 43)
@@ -4600,8 +4586,6 @@ cglobal inv_txfm_add_dct_dct_32x32_8bpc, 4, 6, 8, 16*36, dst, stride, coeff, eob
 
 
 cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*35], eobd
@@ -4676,7 +4660,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass1_end:
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -4684,7 +4668,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -4692,7 +4676,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -4700,7 +4684,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -4714,7 +4698,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 .pass2:
     mov                 coeffq, [rsp+gprsize*2+16*35]
     mov                    r3d, 4
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
 
 .pass2_loop:
     mov  [rsp+gprsize*3+16*35], r3d
@@ -4810,11 +4794,11 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     jmp                   tx2q
 
 .pass2_end:
-    lea                     r3, [o(m(idct_32x32_internal_8bpc).pass2_end1)]
+    lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .pass2_end1:
-    lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
     mov                    r3d, [rsp+gprsize*3+16*35]
@@ -4825,8 +4809,6 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 
 cglobal inv_txfm_add_identity_identity_32x32_8bpc, 4, 6, 8, 16*5, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     cmp                   eobd, 136
     mov                    r3d, 4
@@ -4887,8 +4869,8 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_16x64_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -4897,16 +4879,11 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r2d, 32
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_16x64_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 
-.end:
-    RET
-
 
 cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 151
     mov  [rsp+gprsize*1+16*67], eobd
@@ -4926,7 +4903,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS     coeffq+64*1, 64*2
     call m(idct_16x8_internal_8bpc).main
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -4934,7 +4911,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_16x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -4948,7 +4925,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, 2
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
+    lea                     r4, [o(.end1)]
 
 .pass2_loop:
     mov  [rsp+gprsize*3+16*67], r3d
@@ -5075,7 +5052,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     lea                   dstq, [dstq+strideq*2]
     add                    rsp, 16*32
-    lea                     r3, [o(m(idct_16x64_internal_8bpc).end2)]
+    lea                     r3, [o(.end2)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .end2:
@@ -5086,7 +5063,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_16x64_internal_8bpc).end1)]
+    lea                     r4, [o(.end1)]
 
     dec                    r3d
     jg .pass2_loop
@@ -5757,7 +5734,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*132, dst, stride, coeff, eo
     movd                    m2, [o(pw_8192)]
     mov               [coeffq], eobd
     mov                    r3d, 16
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x16_8bpc).end)]
+    lea                   tx2q, [o(.end)]
 
 .body:
     pmulhrsw                m0, m2
@@ -5887,7 +5864,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -5895,7 +5872,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -5903,7 +5880,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -5911,7 +5888,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -5919,7 +5896,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -5927,7 +5904,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
@@ -5935,7 +5912,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
@@ -5943,7 +5920,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x16_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
@@ -5971,14 +5948,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(idct_16x8_internal_8bpc).main
 
     mov                    r3, dstq
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end)]
+    lea                  tx2q, [o(.end)]
     lea                  dstq, [dstq+strideq*8]
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end:
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end1)]
+    lea                  tx2q, [o(.end1)]
     mov                  dstq, r3
     jmp  m(idct_8x8_internal_8bpc).end
 
@@ -6008,14 +5985,14 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     call m(idct_16x8_internal_8bpc).main
 
     mov                    r3, dstq
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end2)]
+    lea                  tx2q, [o(.end2)]
     lea                  dstq, [dstq+strideq*8]
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end2:
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
-    lea                  tx2q, [o(m(idct_64x16_internal_8bpc).end3)]
+    lea                  tx2q, [o(.end3)]
     mov                  dstq, r3
     jmp  m(idct_8x8_internal_8bpc).end
 
@@ -6037,8 +6014,8 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_32x64_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -6048,16 +6025,11 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*68, dst, stride, coeff, eob
     mov               [coeffq], eobd
     pmulhrsw                m0, m1
     mov                    r3d, 64
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_32x64_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_32x8_8bpc).body
 
-.end:
-    RET
-
 
 cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*67], eobd
@@ -6125,28 +6097,28 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
 .pass1_end:
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_32x64_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
@@ -6171,8 +6143,8 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
 %endif
     test                  eobd, eobd
     jz .dconly
-
     call m(idct_64x32_internal_8bpc)
+.end:
     RET
 
 .dconly:
@@ -6182,15 +6154,11 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
     pmulhrsw                m0, m1
     mov               [coeffq], eobd
     mov                    r3d, 32
-    lea                   tx2q, [o(m(inv_txfm_add_dct_dct_64x32_8bpc).end)]
+    lea                   tx2q, [o(.end)]
     jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
 
-.end:
-    RET
 
 cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r4d, 2
     sub                   eobd, 136
     mov  [rsp+gprsize*1+16*67], eobd
@@ -6258,56 +6226,56 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
 
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
     SAVE_8ROWS    coeffq+64*24, 64
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
     SAVE_8ROWS       dstq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
     SAVE_8ROWS       dstq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
     SAVE_8ROWS      dstq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
@@ -6324,17 +6292,17 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                   eobd, [rsp+gprsize*1+16*67]
     lea                   dstq, [dstq+32]
     mov  [rsp+gprsize*1+16*35], eobd
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     mov                    r3d, 4
     jmp m(idct_32x32_internal_8bpc).pass2_loop
 
 .pass2_end:
     mova    [rsp+gprsize+16*0], m7
-    lea                     r3, [o(m(idct_64x32_internal_8bpc).pass2_end1)]
+    lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end2
 
 .pass2_end1:
-    lea                   tx2q, [o(m(idct_64x32_internal_8bpc).pass2_end)]
+    lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
     mov                    r3d, [rsp+gprsize*3+16*35]
@@ -6369,8 +6337,6 @@ cglobal inv_txfm_add_dct_dct_64x64_8bpc, 4, 6, 8, 16*197, dst, stride, coeff, eo
     jmp m(inv_txfm_add_dct_dct_64x16_8bpc).body
 
 cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
-    %undef cmp
-
     mov                    r5d, 4
     mov                    r4d, 2
     sub                   eobd, 136
@@ -6440,7 +6406,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end)]
+    lea                   tx2q, [o(.pass1_end)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
@@ -6448,7 +6414,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end1)]
+    lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
@@ -6456,7 +6422,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end2)]
+    lea                   tx2q, [o(.pass1_end2)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
@@ -6464,7 +6430,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end3)]
+    lea                   tx2q, [o(.pass1_end3)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
@@ -6472,7 +6438,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end4)]
+    lea                   tx2q, [o(.pass1_end4)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
@@ -6480,7 +6446,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end5)]
+    lea                   tx2q, [o(.pass1_end5)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
@@ -6488,7 +6454,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end6)]
+    lea                   tx2q, [o(.pass1_end6)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
@@ -6496,7 +6462,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
-    lea                   tx2q, [o(m(idct_64x64_internal_8bpc).pass1_end7)]
+    lea                   tx2q, [o(.pass1_end7)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
@@ -6514,7 +6480,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, 4
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+    lea                     r4, [o(.pass2_end)]
     jmp m(idct_16x64_internal_8bpc).pass2_loop
 
 .pass2_end:
@@ -6522,7 +6488,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     lea                   dstq, [dstq+strideq*2]
     add                    rsp, 16*32
     mova    [rsp+gprsize+16*0], m7
-    lea                     r3, [o(m(idct_64x64_internal_8bpc).pass2_end1)]
+    lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end2
 
 .pass2_end1:
@@ -6533,7 +6499,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride, coeff, eob, tx2
     mov                    r3d, [rsp+gprsize*3+16*67]
     lea                     r4, [dstq+8]
     mov  [rsp+gprsize*2+16*67], r4
-    lea                     r4, [o(m(idct_64x64_internal_8bpc).pass2_end)]
+    lea                     r4, [o(.pass2_end)]
 
     dec                    r3d
     jg  m(idct_16x64_internal_8bpc).pass2_loop
author	Henrik Gramner <gramner@twoorioles.com>	2022-02-28 02:41:15 +0300
committer	Henrik Gramner <henrik@gramner.com>	2022-02-28 02:43:30 +0300
commit	cf00849987f052cac3b3e147b8909a7a1348b527 (patch)
tree	b70020539efc95727537593e0a84538c52a00743
parent	9124c54b346421231c3b1ea716dc928d678eb558 (diff)