Index: src/x86/itx_sse.asm
--- src/x86/itx_sse.asm.orig
+++ src/x86/itx_sse.asm
@@ -246,6 +246,7 @@ cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, str
     call %%p1
     RET
 %%end:
+    _CET_ENDBR
 %else
     lea                  tx2q, [o(m(i%2_%3_internal_8bpc).pass2)]
 %ifidn %1_%2, dct_dct
@@ -255,6 +256,7 @@ cglobal inv_txfm_add_%1_%2_%3_8bpc, 4, 6, %4, dst, str
     times ((%%end - %%p1) >> 31) & 1 jmp %%p1
 ALIGN function_align
 %%end:
+    _CET_ENDBR
 %endif
 %endif
 %endmacro
@@ -295,6 +297,7 @@ cglobal idct_4x4_internal_8bpc, 0, 0, 0, dst, stride, 
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     IDCT4_1D_PACKED
 
     pxor                 m2, m2
@@ -319,14 +322,17 @@ cglobal iadst_4x4_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     call .main
 
 .end:
+    _CET_ENDBR
     pxor                 m2, m2
     mova      [coeffq+16*0], m2
     mova      [coeffq+16*1], m2
 
 .end2:
+    _CET_ENDBR
     ITX4_END              0, 1, 2, 3
 
 ALIGN function_align
@@ -371,14 +377,17 @@ cglobal iflipadst_4x4_internal_8bpc, 0, 0, 0, dst, str
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     call m(iadst_4x4_internal_8bpc).main
 
 .end:
+    _CET_ENDBR
     pxor                 m2, m2
     mova      [coeffq+16*0], m2
     mova      [coeffq+16*1], m2
 
 .end2:
+    _CET_ENDBR
     ITX4_END              3, 2, 1, 0
 
 INV_TXFM_4X4_FN identity, dct
@@ -401,6 +410,7 @@ cglobal iidentity_4x4_internal_8bpc, 0, 0, 0, dst, str
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     mova                 m3, [o(pw_1697x8)]
     pmulhrsw             m2, m3, m0
     pmulhrsw             m3, m1
@@ -568,10 +578,12 @@ cglobal idct_4x8_internal_8bpc, 0, 0, 0, dst, stride, 
     pmulhrsw             m3,     [coeffq+16*3]
 
 .pass1:
+    _CET_ENDBR
     call m(idct_8x4_internal_8bpc).main
     jmp m(iadst_4x8_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     call .main
     shufps               m1, m1, q1032
     shufps               m3, m3, q1032
@@ -597,13 +609,16 @@ cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride,
     pmulhrsw             m3,     [coeffq+16*3]
 
 .pass1:
+    _CET_ENDBR
     call m(iadst_8x4_internal_8bpc).main
 
 .pass1_end:
+    _CET_ENDBR
     INV_4X8
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     shufps               m0, m0, q1032
     shufps               m1, m1, q1032
     call .main
@@ -612,9 +627,11 @@ cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride,
     psubw                m5, m4
 
 .end:
+    _CET_ENDBR
     punpcklqdq           m4, m5
 
 .end2:
+    _CET_ENDBR
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
     pmulhrsw             m2, m4
@@ -626,6 +643,7 @@ cglobal iadst_4x8_internal_8bpc, 0, 0, 0, dst, stride,
     mova      [coeffq+16*3], m5
 
 .end3:
+    _CET_ENDBR
     WRITE_4X8             0, 1, 2, 3
     RET
 
@@ -688,6 +706,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, str
     pmulhrsw             m3,     [coeffq+16*3]
 
 .pass1:
+    _CET_ENDBR
     call m(iadst_8x4_internal_8bpc).main
 
     punpcklwd            m4, m3, m2
@@ -701,6 +720,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 0, 0, dst, str
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     shufps               m0, m0, q1032
     shufps               m1, m1, q1032
     call m(iadst_4x8_internal_8bpc).main
@@ -729,6 +749,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, str
     pmulhrsw             m3,     [coeffq+16*3]
 
 .pass1:
+    _CET_ENDBR
     mova                 m7, [o(pw_1697x8)]
     pmulhrsw             m4, m7, m0
     pmulhrsw             m5, m7, m1
@@ -741,6 +762,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 0, 0, dst, str
     jmp m(iadst_4x8_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     mova                 m4, [o(pw_4096)]
     jmp m(iadst_4x8_internal_8bpc).end2
 
@@ -822,6 +844,7 @@ cglobal idct_8x4_internal_8bpc, 0, 0, 0, dst, stride, 
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     call .main
     jmp m(iadst_8x4_internal_8bpc).end
 
@@ -865,9 +888,11 @@ cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride,
     jmp              tx2q
 
 .pass2:
+    _CET_ENDBR
     call .main
 
 .end:
+    _CET_ENDBR
     mova                 m4, [o(pw_2048)]
     pmulhrsw             m0, m4
     pmulhrsw             m1, m4
@@ -875,12 +900,14 @@ cglobal iadst_8x4_internal_8bpc, 0, 0, 0, dst, stride,
     pmulhrsw             m3, m4
 
 .end2:
+    _CET_ENDBR
     pxor                 m6, m6
     mova      [coeffq+16*0], m6
     mova      [coeffq+16*1], m6
     mova      [coeffq+16*2], m6
     mova      [coeffq+16*3], m6
 .end3:
+    _CET_ENDBR
     WRITE_8X4             0, 1, 2, 3, 4, 5, 6
     RET
 
@@ -984,6 +1011,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 0, 0, dst, str
     jmp                  tx2q
 
 .pass2:
+    _CET_ENDBR
     call m(iadst_8x4_internal_8bpc).main
     mova                 m4, m0
     mova                 m5, m1
@@ -1024,6 +1052,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, str
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     mova                 m7, [o(pw_1697x8)]
     pmulhrsw             m4, m7, m0
     pmulhrsw             m5, m7, m1
@@ -1049,6 +1078,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, str
     pmulhrsw             m0, m1
     pmulhrsw             m0, m2
 .end:
+    _CET_ENDBR
     mov                 r3d, 2
     lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x8_8bpc).end3)]
 .loop:
@@ -1058,6 +1088,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 0, 0, dst, str
     jg .loop
     jmp                tx2q
 .end3:
+    _CET_ENDBR
     RET
 %endif
 %endmacro
@@ -1104,16 +1135,20 @@ cglobal idct_8x8_internal_8bpc, 0, 0, 0, dst, stride, 
     LOAD_8ROWS          coeffq, 16
 
 .pass1:
+    _CET_ENDBR
     call .main
 
 .pass1_end:
+    _CET_ENDBR
     mova                    m7, [o(pw_16384)]
 
 .pass1_end1:
+    _CET_ENDBR
     REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
     mova    [rsp+gprsize+16*1], m6
 
 .pass1_end2:
+    _CET_ENDBR
     REPX      {pmulhrsw x, m7}, m1, m3, m5
     pmulhrsw                m7, [rsp+gprsize+16*0]
 
@@ -1151,29 +1186,34 @@ cglobal_label .pass1_end3
     jmp                   tx2q
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
 
 .pass2_main:
     call .main
 
 .end:
+    _CET_ENDBR
     mova                    m7, [o(pw_2048)]
     REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
     mova    [rsp+gprsize+16*1], m6
 
 .end2:
+    _CET_ENDBR
     REPX      {pmulhrsw x, m7}, m1, m3, m5
     pmulhrsw                m7, [rsp+gprsize+16*0]
     mova    [rsp+gprsize+16*2], m5
     mova    [rsp+gprsize+16*0], m7
 
 .end3:
+    _CET_ENDBR
     WRITE_8X4                0, 1, 2, 3, 5, 6, 7
     lea                   dstq, [dstq+strideq*2]
     WRITE_8X4                4, [rsp+gprsize+16*2], [rsp+gprsize+16*1], [rsp+gprsize+16*0], 5, 6, 7
     jmp                   tx2q
 
 .end4:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
     ret
@@ -1216,13 +1256,16 @@ cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride,
     LOAD_8ROWS          coeffq, 16
 
 .pass1:
+    _CET_ENDBR
     call .main
     call .main_pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     mova                    m7, [o(pw_16384)]
 
 .pass1_end1:
+    _CET_ENDBR
     REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
     mova    [rsp+gprsize+16*1], m6
     pxor                    m6, m6
@@ -1232,6 +1275,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 0, 0, dst, stride,
 
 ALIGN function_align
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
 
 .pass2_main:
@@ -1239,6 +1283,7 @@ ALIGN function_align
     call .main_pass2_end
 
 .end:
+    _CET_ENDBR
     mova                    m7, [o(pw_2048)]
     REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
     mova    [rsp+gprsize+16*1], m6
@@ -1355,13 +1400,16 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, str
     LOAD_8ROWS          coeffq, 16
 
 .pass1:
+    _CET_ENDBR
     call m(iadst_8x8_internal_8bpc).main
     call m(iadst_8x8_internal_8bpc).main_pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     mova                    m7, [o(pw_m16384)]
 
 .pass1_end1:
+    _CET_ENDBR
     pmulhrsw                m1, m7
     mova    [rsp+gprsize+16*1], m1
     mova                    m1, m6
@@ -1382,6 +1430,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 0, 0, dst, str
 
 ALIGN function_align
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
 
 .pass2_main:
@@ -1389,6 +1438,7 @@ ALIGN function_align
     call m(iadst_8x8_internal_8bpc).main_pass2_end
 
 .end:
+    _CET_ENDBR
     mova                    m7, [o(pw_2048)]
     REPX      {pmulhrsw x, m7}, m0, m2, m4, m6
     mova    [rsp+gprsize+16*2], m2
@@ -1419,9 +1469,11 @@ cglobal iidentity_8x8_internal_8bpc, 0, 0, 0, dst, str
 
 ALIGN function_align
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(m(idct_8x8_internal_8bpc).end4)]
 
 .end:
+    _CET_ENDBR
     pmulhrsw                m7, [o(pw_4096)]
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_4096)]
@@ -1443,6 +1495,7 @@ ALIGN function_align
     pmulhrsw              m0, m1
     pmulhrsw              m0, [o(pw_2048)]
 .end:
+    _CET_ENDBR
     WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
     lea                dstq, [dstq+strideq*4]
     WRITE_4X4             0, 0, 1, 2, 3, 0, 1, 2, 3
@@ -1463,6 +1516,7 @@ cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride,
     lea                  r3, [o(m(idct_4x8_internal_8bpc).pass1)]
 
 .pass1:
+    _CET_ENDBR
     mova                 m0, [coeffq+16*1]
     mova                 m1, [coeffq+16*3]
     mova                 m2, [coeffq+16*5]
@@ -1472,6 +1526,7 @@ cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                  r3
 
 .pass1_2:
+    _CET_ENDBR
     mova      [coeffq+16*1], m0
     mova      [coeffq+16*3], m1
     mova      [coeffq+16*5], m2
@@ -1484,6 +1539,7 @@ cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                  r3
 
 .pass1_end:
+    _CET_ENDBR
     pop                tx2q
 
     mova                 m4, [coeffq+16*1]
@@ -1497,15 +1553,18 @@ cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                tx2q
 
 .pass2:
+    _CET_ENDBR
     call m(idct_16x4_internal_8bpc).main
 
 .end:
+    _CET_ENDBR
     mova                  m7, [o(pw_2048)]
     REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
     pmulhrsw              m7, [coeffq+16*7]
     mova       [coeffq+16*4], m4
 
 .end1:
+    _CET_ENDBR
     mova       [coeffq+16*5], m5
     mova       [coeffq+16*6], m6
     mov                   r3, coeffq
@@ -1519,6 +1578,7 @@ cglobal idct_4x16_internal_8bpc, 0, 0, 0, dst, stride,
     WRITE_4X8              0, 1, 3, 2
 
 .end2:
+    _CET_ENDBR
     pxor                  m7, m7
     REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
     ret
@@ -1533,6 +1593,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_4x16_internal_8bpc).pass1
 
 .pass2:
+    _CET_ENDBR
     call m(iadst_16x4_internal_8bpc).main
     call m(iadst_16x4_internal_8bpc).main_pass2_end
 
@@ -1552,6 +1613,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride
     mova                  m7, [o(pw_2048)]
 
 .end1:
+    _CET_ENDBR
     REPX    {pmulhrsw x, m7}, m0, m5, m4, m6
     pxor                  m3, m3
     psubw                 m3, m7
@@ -1573,6 +1635,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride
     mova                  m3, m4
 
 .end2:
+    _CET_ENDBR
     mova       [coeffq+16*5], m5
     mova       [coeffq+16*6], m6
     mov                   r3, coeffq
@@ -1586,6 +1649,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 0, 0, dst, stride
     WRITE_4X8              0, 1, 2, 3
 
 .end3:
+    _CET_ENDBR
     pxor                  m7, m7
     REPX     {mova [r3+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7
     ret
@@ -1601,6 +1665,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 0, 0, dst, st
     jmp   m(idct_4x16_internal_8bpc).pass1
 
 .pass2:
+    _CET_ENDBR
     call m(iadst_16x4_internal_8bpc).main
     call m(iadst_16x4_internal_8bpc).main_pass2_end
 
@@ -1646,6 +1711,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, st
     mov                   r3, tx2q
     lea                 tx2q, [o(.pass1_2)]
 .pass1:
+    _CET_ENDBR
     pmulhrsw              m4, m6, m0
     pmulhrsw              m5, m6, m1
     pavgw                 m4, m0
@@ -1664,6 +1730,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, st
     pandn                 m3, m5
     jmp m(iadst_4x8_internal_8bpc).pass1_end
 .pass1_2:
+    _CET_ENDBR
     mova       [coeffq+16*1], m0
     mova       [coeffq+16*3], m1
     mova       [coeffq+16*5], m2
@@ -1675,11 +1742,13 @@ cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, st
     lea                 tx2q, [o(.pass1_end)]
     jmp .pass1
 .pass1_end:
+    _CET_ENDBR
     mova                  m4, [coeffq+16*1]
     mova                  m5, [coeffq+16*3]
     mova                  m6, [coeffq+16*5]
     jmp                   r3
 .pass2:
+    _CET_ENDBR
     mova                  m7, [o(pw_1697x16)]
     mova       [coeffq+16*6], m6
     REPX    {IDTX16 x, 6, 7}, 0, 1, 2, 3, 4, 5
@@ -1734,6 +1803,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 0, 0, dst, st
     jg .dconly_loop
     jmp                tx2q
 .end:
+    _CET_ENDBR
     RET
 %endif
 %endmacro
@@ -1801,6 +1871,7 @@ cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride,
     call .main
 
 .pass1_end:
+    _CET_ENDBR
     punpckhwd             m7, m0, m2                 ;packed out1,  out5
     punpcklwd             m0, m2                     ;packed out0,  out4
     punpcklwd             m2, m1, m3                 ;packed out3,  out7
@@ -1813,12 +1884,14 @@ cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride,
     punpckhwd             m5, m7                     ;packed out10, out14
 
 .pass1_end2:
+    _CET_ENDBR
     mova                  m7, [o(pw_16384)]
     REPX    {pmulhrsw x, m7}, m0, m1, m2, m3, m4, m5, m6
     pmulhrsw              m7, [coeffq+16*6]
     mova       [coeffq+16*6], m7
 
 .pass1_end3:
+    _CET_ENDBR
     punpckhwd             m7, m3, m6                 ;packed 9, 11, 13, 15 high
     punpcklwd             m3, m6                     ;packed 9, 10, 13, 15 low
     punpckhwd             m6, m4, m5                 ;packed 8, 10, 12, 14 high
@@ -1840,9 +1913,11 @@ cglobal idct_16x4_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                 tx2q
 
 .pass2:
+    _CET_ENDBR
     lea                 tx2q, [o(m(idct_8x4_internal_8bpc).pass2)]
 
 .pass2_end:
+    _CET_ENDBR
     mova       [coeffq+16*4], m4
     mova       [coeffq+16*5], m5
     mova       [coeffq+16*6], m6
@@ -1922,6 +1997,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride
     mova                  m7, [o(pw_16384)]
 
 .pass1_end:
+    _CET_ENDBR
     REPX    {pmulhrsw x, m7}, m0, m1, m4, m5
     pxor                  m2, m2
     psubw                 m2, m7
@@ -1932,6 +2008,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_16x4_internal_8bpc).pass1_end3
 
 .pass2:
+    _CET_ENDBR
     lea                 tx2q, [o(m(iadst_8x4_internal_8bpc).pass2)]
     jmp   m(idct_16x4_internal_8bpc).pass2_end
 
@@ -2107,6 +2184,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 0, 0, dst, st
     jmp   m(iadst_16x4_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     lea                 tx2q, [o(m(iflipadst_8x4_internal_8bpc).pass2)]
     jmp   m(idct_16x4_internal_8bpc).pass2_end
 
@@ -2169,6 +2247,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, st
     jmp   m(idct_16x4_internal_8bpc).pass1_end3
 
 .pass2:
+    _CET_ENDBR
     lea                 tx2q, [o(m(iidentity_8x4_internal_8bpc).pass2)]
     jmp   m(idct_16x4_internal_8bpc).pass2_end
 
@@ -2202,6 +2281,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 0, 0, dst, st
     lea                tx2q, [o(m(inv_txfm_add_dct_dct_8x16_8bpc).end)]
     jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
 .end:
+    _CET_ENDBR
     RET
 %endif
 %endmacro
@@ -2215,18 +2295,21 @@ cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride,
     lea                    r3, [o(m(idct_8x8_internal_8bpc).pass1)]
 
 .pass1:
+    _CET_ENDBR
     LOAD_8ROWS    coeffq+16*1, 32, 1
     mov   [rsp+gprsize+16*11], tx2q
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).pass1_end)]
     jmp                    r3
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*1, 32
     LOAD_8ROWS    coeffq+16*0, 32, 1
     mov                  tx2q, [rsp+gprsize+16*11]
     jmp                    r3
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end)]
 
 .pass2_pre:
@@ -2261,6 +2344,7 @@ cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride,
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
@@ -2268,6 +2352,7 @@ cglobal idct_8x16_internal_8bpc, 0, 0, 0, dst, stride,
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end1:
+    _CET_ENDBR
     pxor                   m7, m7
     REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
     ret
@@ -2282,6 +2367,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x16_internal_8bpc).pass1
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(m(iadst_8x16_internal_8bpc).end)]
 
 .pass2_pre:
@@ -2316,6 +2402,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp m(iadst_8x8_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
@@ -2333,6 +2420,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, st
     jmp  m(idct_8x16_internal_8bpc).pass1
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(m(iflipadst_8x16_internal_8bpc).end)]
     lea                     r3, [dstq+strideq*8]
 
@@ -2365,6 +2453,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 0, 0, dst, st
     jmp  m(iflipadst_8x8_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
@@ -2385,6 +2474,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, st
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*1, 32
     LOAD_8ROWS    coeffq+16*0, 32, 1
     mov                  tx2q, r3
@@ -2392,9 +2482,11 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, st
     jmp  m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(.end1)]
 
 .end:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*0], m7
     mova   [rsp+gprsize+16*1], m6
     mova                   m7, [o(pw_1697x16)]
@@ -2413,6 +2505,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, st
     jmp  m(idct_8x8_internal_8bpc).end3
 
 .end1:
+    _CET_ENDBR
     LOAD_8ROWS    coeffq+16*1, 32
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     lea                  dstq, [dstq+strideq*2]
@@ -2431,6 +2524,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 0, 0, dst, st
     lea                tx2q, [o(m(inv_txfm_add_dct_dct_16x8_8bpc).end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 .end:
+    _CET_ENDBR
     RET
 %endif
 %endmacro
@@ -2452,6 +2546,7 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride,
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*1, 32
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
@@ -2459,11 +2554,13 @@ cglobal idct_16x8_internal_8bpc, 0, 0, 0, dst, stride,
     jmp  m(idct_8x8_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(idct_8x8_internal_8bpc).pass2_main
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    coeffq+16*1, 32
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     mov                  dstq, r3
@@ -2591,6 +2688,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*1, 32
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
@@ -2598,11 +2696,13 @@ cglobal iadst_16x8_internal_8bpc, 0, 0, 0, dst, stride
     jmp m(iadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp m(iadst_8x8_internal_8bpc).pass2_main
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    coeffq+16*1, 32
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     mov                  dstq, r3
@@ -2876,6 +2976,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, st
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+16*1, 32
     LOAD_8ROWS     coeffq+16*0, 32
     mova    [rsp+gprsize+16*0], m7
@@ -2883,11 +2984,13 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 0, 0, dst, st
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x8_internal_8bpc).pass2_main
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS     coeffq+16*1, 32
     lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     mov                   dstq, r3
@@ -2909,6 +3012,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, st
     lea                  tx2q, [o(.pass1_end)]
 
 .pass1:
+    _CET_ENDBR
     mova                   m0, [o(pw_2896x8)]
     mova                   m2, [o(pw_1697x16)]
     mova                   m3, [o(pw_16384)]
@@ -2948,6 +3052,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, st
     jmp   m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass1_end:
+    _CET_ENDBR
     mova        [coeffq+16*1], m4
     mova        [coeffq+16*3], m5
     mova        [coeffq+16*5], m6
@@ -2964,11 +3069,13 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, st
     jmp .pass1
 
 .pass2:
+    _CET_ENDBR
     lea                  tx2q, [o(.end)]
     lea                    r3, [dstq+8]
     jmp  m(iidentity_8x8_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    coeffq+16*1, 32
     lea                  tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     mov                  dstq, r3
@@ -2986,6 +3093,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 0, 0, dst, st
     lea                  tx2q, [o(m(inv_txfm_add_dct_dct_16x16_8bpc).end)]
     jmp m(inv_txfm_add_dct_dct_16x4_8bpc).dconly
 .end:
+    _CET_ENDBR
     RET
 %endif
 %endmacro
@@ -3007,6 +3115,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -3015,6 +3124,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+16*1, 32
     LOAD_8ROWS     coeffq+16*0, 64
     call  m(idct_8x8_internal_8bpc).main
@@ -3026,6 +3136,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -3034,10 +3145,12 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(.end)]
     jmp  m(idct_8x16_internal_8bpc).pass2_pre
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(.end1)]
@@ -3046,6 +3159,7 @@ cglobal idct_16x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end1:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
@@ -3133,6 +3247,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, strid
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*17, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -3141,6 +3256,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, strid
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+16*1, 32
     ITX_16X16_ADST_LOAD_EVEN_COEFS
     call m(iadst_16x8_internal_8bpc).main
@@ -3151,6 +3267,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, strid
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -3159,10 +3276,12 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, strid
     jmp  m(iadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(.end)]
     jmp m(iadst_8x16_internal_8bpc).pass2_pre
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(.end1)]
@@ -3171,6 +3290,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 0, 0, dst, strid
     jmp  m(iadst_8x8_internal_8bpc).end
 
 .end1:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
@@ -3208,6 +3328,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+16*1, 32
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -3216,6 +3337,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*17, 32
     ITX_16X16_ADST_LOAD_EVEN_COEFS
     call m(iadst_16x8_internal_8bpc).main
@@ -3230,6 +3352,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp  m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS    coeffq+16* 0, 32
     mova    [rsp+gprsize+16*0], m7
@@ -3238,11 +3361,13 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp m(iflipadst_8x8_internal_8bpc).pass1_end1
 
 .pass2:
+    _CET_ENDBR
     lea                   tx2q, [o(.end)]
     lea                     r3, [dstq+8]
     jmp m(iflipadst_8x16_internal_8bpc).pass2_pre
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(.end1)]
@@ -3250,6 +3375,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp  m(iflipadst_8x8_internal_8bpc).end
 
 .end1:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
@@ -3273,6 +3399,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp m(iflipadst_8x16_internal_8bpc).pass2_main
 
 .end2:
+    _CET_ENDBR
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
@@ -3295,6 +3422,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, s
     lea                   tx2q, [o(.pass1_end)]
 
 .pass1:
+    _CET_ENDBR
     mova                    m6, [o(pw_1697x16)]
     mova                    m7, [coeffq+32*6]
     mova                    m0, [coeffq+32*0]
@@ -3311,28 +3439,33 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp   m(idct_8x8_internal_8bpc).pass1_end3
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 16
     lea                   tx2q, [o(.pass1_end1)]
     jmp .pass1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 15*16
     lea                   tx2q, [o(.pass1_end2)]
     jmp .pass1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS          coeffq, 32
     sub                 coeffq, 16
     mov                   tx2q, r3
     jmp .pass1
 
 .pass2:
+    _CET_ENDBR
     lea                     r3, [dstq+8]
     lea                   tx2q, [o(.end1)]
 
 .end:
+    _CET_ENDBR
     mova    [rsp+gprsize+16*0], m7
     mova    [rsp+gprsize+16*1], m4
     mova                    m7, [o(pw_1697x16)]
@@ -3352,12 +3485,14 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp   m(idct_8x8_internal_8bpc).end3
 
 .end1:
+    _CET_ENDBR
     LOAD_8ROWS     coeffq+16*1, 32
     lea                   tx2q, [o(.end2)]
     lea                   dstq, [dstq+strideq*2]
     jmp .end
 
 .end2:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
@@ -3368,6 +3503,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 0, 0, dst, s
     jmp .end
 
 .end3:
+    _CET_ENDBR
     LOAD_8ROWS     coeffq+16*1, 32
     lea                   tx2q, [o(m(idct_8x16_internal_8bpc).end1)]
     lea                   dstq, [dstq+strideq*2]
@@ -3399,6 +3535,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 6, 8, 16*36
     jmp m(inv_txfm_add_dct_dct_8x8_8bpc).loop
 
 .end:
+    _CET_ENDBR
     RET
 
 
@@ -3414,6 +3551,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*9 ], m0                        ;in24
     mova   [rsp+gprsize+16*10], m4                        ;in28
     mova   [rsp+gprsize+16*17], m2                        ;in26
@@ -3429,6 +3567,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_1:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*7 ], m0                        ;in16
     mova   [rsp+gprsize+16*8 ], m4                        ;in20
     mova   [rsp+gprsize+16*15], m2                        ;in18
@@ -3446,6 +3585,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*5 ], m0                        ;in8
     mova   [rsp+gprsize+16*6 ], m4                        ;in12
     mova   [rsp+gprsize+16*13], m2                        ;in10
@@ -3461,6 +3601,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*11], m2                        ;in2
     mova   [rsp+gprsize+16*12], m6                        ;in6
     mova   [rsp+gprsize+16*19], m1                        ;in1
@@ -3505,13 +3646,16 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     call .main
 
 .pass2:
+    _CET_ENDBR
     lea                     r3, [o(.end6)]
 
 .end:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*0 ], m7
     lea                   tx2q, [o(.end2)]
 
 .end1:
+    _CET_ENDBR
     pxor                    m7, m7
     REPX   {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  \
                                      8,  9,  10, 11, 12, 13, 14, 15, \
@@ -3521,10 +3665,12 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp                   tx2q
 
 .end2:
+    _CET_ENDBR
     lea                   tx2q, [o(.end3)]
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end3:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
@@ -3532,6 +3678,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end4:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
@@ -3539,6 +3686,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end5:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
     lea                   dstq, [dstq+strideq*2]
@@ -3546,6 +3694,7 @@ cglobal idct_8x32_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).end
 
 .end6:
+    _CET_ENDBR
     ret
 
 ALIGN function_align
@@ -3906,6 +4055,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 6, 8, 16*36
     jmp                   tx2q
 
 .end:
+    _CET_ENDBR
     RET
 
 
@@ -3947,21 +4097,25 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride,
     call m(idct_8x32_internal_8bpc).main
 
 .pass2:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*0 ], m7
     lea                   tx2q, [o(.end)]
     jmp  m(idct_8x32_internal_8bpc).end1
 
 .end:
+    _CET_ENDBR
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(.end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end1:
+    _CET_ENDBR
     lea                     r3, [dstq+8]
     lea                   tx2q, [o(.end2)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end2:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
@@ -3969,12 +4123,14 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end3:
+    _CET_ENDBR
     mov                   dstq, r3
     add                     r3, 8
     lea                   tx2q, [o(.end4)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end4:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
@@ -3982,12 +4138,14 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end5:
+    _CET_ENDBR
     mov                   dstq, r3
     add                     r3, 8
     lea                   tx2q, [o(.end6)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end6:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
     mova                    m7, [o(pw_8192)]
@@ -3995,11 +4153,13 @@ cglobal idct_32x8_internal_8bpc, 0, 0, 0, dst, stride,
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .end7:
+    _CET_ENDBR
     mov                   dstq, r3
     lea                   tx2q, [o(.end8)]
     jmp   m(idct_8x8_internal_8bpc).pass2_main
 
 .end8:
+    _CET_ENDBR
     ret
 
 
@@ -4076,6 +4236,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 6, 8, 16*3
     jz .dconly
     call  m(idct_16x32_internal_8bpc)
 .end:
+    _CET_ENDBR
     RET
 
 .dconly:
@@ -4099,6 +4260,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*33, 64               ;in8~in15
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4106,6 +4268,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
+    _CET_ENDBR
     mova        [coeffq+16*1 ], m0                        ;in8
     mova        [coeffq+16*5 ], m4                        ;in12
     mova   [rsp+gprsize+16*13], m2                        ;in10
@@ -4123,6 +4286,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*32, 64               ;in0~in7
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4130,6 +4294,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*11], m2                        ;in2
     mova   [rsp+gprsize+16*12], m6                        ;in6
     mova   [rsp+gprsize+16*19], m1                        ;in1
@@ -4173,6 +4338,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*34, 64               ;in16~in23
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4180,6 +4346,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
+    _CET_ENDBR
     mova        [coeffq+16*2 ], m0                        ;in16
     mova        [coeffq+16*6 ], m4                        ;in20
     mova   [rsp+gprsize+16*15], m2                        ;in18
@@ -4198,6 +4365,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*35, 64                        ;in24~in31
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4205,6 +4373,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*17], m2                        ;in26
     mova   [rsp+gprsize+16*18], m6                        ;in30
     mova   [rsp+gprsize+16*31], m1                        ;in25
@@ -4230,6 +4399,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     call m(idct_8x32_internal_8bpc).main
 
 .pass2:
+    _CET_ENDBR
     mov  [rsp+gprsize*1+16*35], eobd
     lea                     r3, [dstq+8]
     mov  [rsp+gprsize*2+16*35], r3
@@ -4237,6 +4407,7 @@ cglobal idct_16x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x32_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     mov                   dstq, [rsp+gprsize*2+16*35]
     mov                   eobd, [rsp+gprsize*1+16*35]
     add                 coeffq, 16*32
@@ -4377,6 +4548,7 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     add                 coeffq, 16
     lea                     r3, [o(.pass1_end1)]
 .pass1:
+    _CET_ENDBR
     LOAD_8ROWS     coeffq+16*0, 128, 1
     call  m(idct_8x8_internal_8bpc).main
     SAVE_7ROWS    rsp+gprsize+16*3, 16
@@ -4408,11 +4580,13 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     call m(idct_8x32_internal_8bpc).main
 
 .pass1_end:
+    _CET_ENDBR
     mova   [rsp+gprsize+16*0 ], m7
     mov                   tx2q, r3
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+16*0, 32
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova   [rsp+gprsize+16*0 ], m7
@@ -4420,6 +4594,7 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*16, 32
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova   [rsp+gprsize+16*0 ], m7
@@ -4427,6 +4602,7 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*32, 32
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova   [rsp+gprsize+16*0 ], m7
@@ -4434,6 +4610,7 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+16*48, 32
 
     sub                 coeffq, 16
@@ -4441,6 +4618,7 @@ cglobal idct_32x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp .pass1
 
 .end:
+    _CET_ENDBR
     ret
 
 
@@ -4658,12 +4836,14 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
     call m(idct_8x32_internal_8bpc).main_fast
 
 .pass1_end:
+    _CET_ENDBR
     mova    [rsp+gprsize+16*0], m7
     mova                    m7, [o(pw_8192)]
     lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4672,6 +4852,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4680,6 +4861,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4688,6 +4870,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*24, 64
 
     add                 coeffq, 16
@@ -4696,6 +4879,7 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
 
 
 .pass2:
+    _CET_ENDBR
     mov                 coeffq, [rsp+gprsize*2+16*35]
     mov                    r3d, 4
     lea                   tx2q, [o(.pass2_end)]
@@ -4794,10 +4978,12 @@ cglobal idct_32x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp                   tx2q
 
 .pass2_end:
+    _CET_ENDBR
     lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end
 
 .pass2_end1:
+    _CET_ENDBR
     lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
@@ -4871,6 +5057,7 @@ cglobal inv_txfm_add_dct_dct_16x64_8bpc, 4, 6, 8, 16*6
     jz .dconly
     call m(idct_16x64_internal_8bpc)
 .end:
+    _CET_ENDBR
     RET
 
 .dconly:
@@ -4907,6 +5094,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS    rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
@@ -4915,6 +5103,7 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*0, 64
 
     add                 coeffq, 16
@@ -5043,12 +5232,14 @@ cglobal idct_16x64_internal_8bpc, 0, 0, 0, dst, stride
     call .main_fast
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova    [rsp+gprsize+16*0], m7
     mov                     r3, r4
     jmp  m(idct_8x32_internal_8bpc).end2
 
 .end1:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     lea                   dstq, [dstq+strideq*2]
     lea                     r3, [rsp+16*32+gprsize]
@@ -5804,6 +5995,7 @@ cglobal inv_txfm_add_dct_dct_64x16_8bpc, 4, 6, 8, 16*1
     jmp                   tx2q
 
 .end:
+    _CET_ENDBR
     RET
 
 
@@ -5892,6 +6084,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+32*0, 32
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5900,6 +6093,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+32*8, 32
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5908,6 +6102,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+32*16, 32
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5916,6 +6111,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+32*24, 32
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5924,6 +6120,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+32*0, 32
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5932,6 +6129,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+32*8, 32
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5940,6 +6138,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+32*16, 32
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
@@ -5948,6 +6147,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+32*24, 32
 
     add                 coeffq, 16
@@ -5956,6 +6156,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jg .pass1_loop
 
 .pass2:
+    _CET_ENDBR
     mov                   dstq, [rsp+gprsize*2+16*67]
     sub                 coeffq, 32
     mov                    r3d, 4
@@ -5977,6 +6178,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(.end1)]
@@ -5984,6 +6186,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end1:
+    _CET_ENDBR
     pxor                   m7, m7
     REPX  {mova [coeffq+16*x], m7}, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15
 
@@ -6014,6 +6217,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end2:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*3, 16
     mova   [rsp+gprsize+16*0], m7
     lea                  tx2q, [o(.end3)]
@@ -6021,7 +6225,7 @@ cglobal idct_64x16_internal_8bpc, 0, 0, 0, dst, stride
     jmp  m(idct_8x8_internal_8bpc).end
 
 .end3:
-
+    _CET_ENDBR
     add                 coeffq, 16*16
     mov                    r3d, [rsp+gprsize*1+16*67]
     mov                   dstq, [rsp+gprsize*2+16*67]
@@ -6040,6 +6244,7 @@ cglobal inv_txfm_add_dct_dct_32x64_8bpc, 4, 6, 8, 16*6
     jz .dconly
     call m(idct_32x64_internal_8bpc)
 .end:
+    _CET_ENDBR
     RET
 
 .dconly:
@@ -6120,11 +6325,13 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride
     call m(idct_8x32_internal_8bpc).main_fast
 
 .pass1_end:
+    _CET_ENDBR
     mova    [rsp+gprsize+16*0], m7
     lea                   tx2q, [o(.pass1_end1)]
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6132,6 +6339,7 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6139,6 +6347,7 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6146,6 +6355,7 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*24, 64
 
     add                 coeffq, 16
@@ -6153,6 +6363,7 @@ cglobal idct_32x64_internal_8bpc, 0, 0, 0, dst, stride
     jg .pass1_loop
 
 .pass2:
+    _CET_ENDBR
     mov                 coeffq, [rsp+gprsize*2+16*67]
     mov                    r3d, 4
     lea                     r4, [dstq+8]
@@ -6169,6 +6380,7 @@ cglobal inv_txfm_add_dct_dct_64x32_8bpc, 4, 6, 8, 16*1
     jz .dconly
     call m(idct_64x32_internal_8bpc)
 .end:
+    _CET_ENDBR
     RET
 
 .dconly:
@@ -6254,6 +6466,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6261,6 +6474,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6268,6 +6482,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6275,6 +6490,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*24, 64
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6282,6 +6498,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6289,6 +6506,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end5:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6296,6 +6514,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end6:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6303,6 +6522,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end
 
 .pass1_end7:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+64*24, 64
 
     add                 coeffq, 16
@@ -6311,6 +6531,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jg .pass1_loop
 
 .pass2:
+    _CET_ENDBR
     mov                 coeffq, [rsp+gprsize*4+16*67]
     mov                   dstq, [rsp+gprsize*3+16*67]
     mov                   eobd, [rsp+gprsize*1+16*67]
@@ -6321,11 +6542,13 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jmp m(idct_32x32_internal_8bpc).pass2_loop
 
 .pass2_end:
+    _CET_ENDBR
     mova    [rsp+gprsize+16*0], m7
     lea                     r3, [o(.pass2_end1)]
     jmp  m(idct_8x32_internal_8bpc).end2
 
 .pass2_end1:
+    _CET_ENDBR
     lea                   tx2q, [o(.pass2_end)]
     add                 coeffq, 16*32
     mov                   dstq, [rsp+gprsize*2+16*35]
@@ -6334,6 +6557,7 @@ cglobal idct_64x32_internal_8bpc, 0, 0, 0, dst, stride
     jg m(idct_32x32_internal_8bpc).pass2_loop
 
 .pass2_end2:
+    _CET_ENDBR
     mov                   dstq, [rsp+gprsize*3+16*67]
     mov                 coeffq, [rsp+gprsize*2+16*67]
     lea                   tx2q, [o(m(idct_32x32_internal_8bpc).pass2_end)]
@@ -6434,6 +6658,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*11, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6442,6 +6667,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end1:
+    _CET_ENDBR
     SAVE_8ROWS     coeffq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*19, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6450,6 +6676,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end2:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*27, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6458,6 +6685,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end3:
+    _CET_ENDBR
     SAVE_8ROWS    coeffq+64*24, 64
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6466,6 +6694,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end4:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+64*0, 64
     LOAD_8ROWS   rsp+gprsize+16*43, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6474,6 +6703,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end5:
+    _CET_ENDBR
     SAVE_8ROWS       dstq+64*8, 64
     LOAD_8ROWS   rsp+gprsize+16*51, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6482,6 +6712,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end6:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+64*16, 64
     LOAD_8ROWS   rsp+gprsize+16*59, 16
     mova    [rsp+gprsize+16*0], m7
@@ -6490,6 +6721,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp   m(idct_8x8_internal_8bpc).pass1_end1
 
 .pass1_end7:
+    _CET_ENDBR
     SAVE_8ROWS      dstq+64*24, 64
 
     add                 coeffq, 16
@@ -6498,6 +6730,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jg .pass1_loop
 
 .pass2:
+    _CET_ENDBR
     mov                   dstq, [rsp+gprsize*3+16*67]
     mov                 coeffq, [rsp+gprsize*2+16*67]
     lea                   dstq, [dstq+32]
@@ -6508,6 +6741,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jmp m(idct_16x64_internal_8bpc).pass2_loop
 
 .pass2_end:
+    _CET_ENDBR
     LOAD_8ROWS   rsp+gprsize+16*35, 16
     lea                   dstq, [dstq+strideq*2]
     lea                     r3, [rsp+16*32+gprsize]
@@ -6523,6 +6757,7 @@ cglobal idct_64x64_internal_8bpc, 0, 0, 0, dst, stride
     jg  m(idct_16x64_internal_8bpc).pass2_loop
 
 .pass2_end2:
+    _CET_ENDBR
     mov                 coeffq, [rsp+gprsize*4+16*67]
     mov                   dstq, [rsp+gprsize*2+16*67]
     mov                    r3d, 4
