Index: src/x86/mc16_avx2.asm
--- src/x86/mc16_avx2.asm.orig
+++ src/x86/mc16_avx2.asm
@@ -222,10 +222,12 @@ cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w,
     test               mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put,)]
     add                  wq, r7
     jmp                  wq
 .put_w2:
+    _CET_ENDBR
     mov                 r6d, [srcq+ssq*0]
     mov                 r7d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -236,6 +238,7 @@ cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov                  r6, [srcq+ssq*0]
     mov                  r7, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -246,6 +249,7 @@ cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -257,6 +261,7 @@ cglobal put_bilin_16bpc, 4, 8, 0, dst, ds, src, ss, w,
     RET
 INIT_YMM avx2
 .put_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -267,6 +272,7 @@ INIT_YMM avx2
     jg .put_w16
     RET
 .put_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+32*0]
     movu                 m1, [srcq+ssq*0+32*1]
     movu                 m2, [srcq+ssq*1+32*0]
@@ -281,6 +287,7 @@ INIT_YMM avx2
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+32*0]
     movu                 m1, [srcq+32*1]
     movu                 m2, [srcq+32*2]
@@ -295,6 +302,7 @@ INIT_YMM avx2
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+32*0]
     movu                 m1, [srcq+32*1]
     movu                 m2, [srcq+32*2]
@@ -317,6 +325,7 @@ INIT_YMM avx2
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     movd                xm5, mxyd
     mov                mxyd, r7m ; my
     vpbroadcastd         m4, [pw_16]
@@ -332,6 +341,7 @@ INIT_YMM avx2
     vpbroadcastd         m3, [r7-put_avx2+put_bilin_h_rnd+r6*4]
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movq                xm1, [srcq+ssq*0]
     movhps              xm1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -348,6 +358,7 @@ INIT_YMM avx2
     jg .h_w2
     RET
 .h_w4:
+    _CET_ENDBR
     movq                xm0, [srcq+ssq*0]
     movhps              xm0, [srcq+ssq*1]
     movq                xm1, [srcq+ssq*0+2]
@@ -365,6 +376,7 @@ INIT_YMM avx2
     jg .h_w4
     RET
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
     vinserti128          m0, [srcq+ssq*1], 1
     movu                xm1, [srcq+ssq*0+2]
@@ -382,6 +394,7 @@ INIT_YMM avx2
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+ssq*0]
     pmullw               m1, m5, [srcq+ssq*0+2]
     paddw                m0, m3
@@ -400,6 +413,7 @@ INIT_YMM avx2
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+32*0]
     pmullw               m1, m5, [srcq+32*0+2]
     paddw                m0, m3
@@ -419,6 +433,7 @@ INIT_YMM avx2
     RET
 .h_w64:
 .h_w128:
+    _CET_ENDBR
     movifnidn           t0d, org_w
 .h_w64_loop0:
     mov                 r6d, t0d
@@ -443,6 +458,7 @@ INIT_YMM avx2
     jg .h_w64_loop0
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
     shl                mxyd, 11
     movd                xm5, mxyd
@@ -450,6 +466,7 @@ INIT_YMM avx2
     vpbroadcastw         m5, xm5
     jmp                  wq
 .v_w2:
+    _CET_ENDBR
     movd                xm0, [srcq+ssq*0]
 .v_w2_loop:
     movd                xm1, [srcq+ssq*1]
@@ -467,6 +484,7 @@ INIT_YMM avx2
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movq                xm0, [srcq+ssq*0]
 .v_w4_loop:
     movq                xm1, [srcq+ssq*1]
@@ -484,6 +502,7 @@ INIT_YMM avx2
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
 .v_w8_loop:
     vbroadcasti128       m1, [srcq+ssq*1]
@@ -501,6 +520,7 @@ INIT_YMM avx2
     jg .v_w8_loop
     RET
 .v_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+32*0]
     movu                 m1, [srcq+ssq*0+32*1]
 .v_w32_loop:
@@ -532,6 +552,7 @@ INIT_YMM avx2
 .v_w16:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     movifnidn           t0d, org_w
     add                 t0d, t0d
     mov                  r4, srcq
@@ -563,6 +584,7 @@ INIT_YMM avx2
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
     WIN64_SPILL_XMM       8
     shl                mxyd, 11
@@ -579,6 +601,7 @@ INIT_YMM avx2
 .hv_12bpc:
     jmp                  wq
 .hv_w2:
+    _CET_ENDBR
     vpbroadcastq        xm1, [srcq+ssq*0]
     pmullw              xm0, xm4, xm1
     psrlq               xm1, 16
@@ -610,6 +633,7 @@ INIT_YMM avx2
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     pmullw              xm0, xm4, [srcq+ssq*0-8]
     pmullw              xm1, xm5, [srcq+ssq*0-6]
     paddw               xm0, xm3
@@ -640,6 +664,7 @@ INIT_YMM avx2
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     pmullw              xm0, xm4, [srcq+ssq*0]
     pmullw              xm1, xm5, [srcq+ssq*0+2]
     paddw               xm0, xm3
@@ -674,6 +699,7 @@ INIT_YMM avx2
 .hv_w32:
 .hv_w64:
 .hv_w128:
+    _CET_ENDBR
 %if UNIX64
     lea                 r6d, [r8*2-32]
 %else
@@ -744,6 +770,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     test               mxyd, mxyd
     jnz .v
 .prep:
+    _CET_ENDBR
     movzx                wd, word [r6+wq*2+table_offset(prep,)]
     mov                 r5d, r7m ; bitdepth_max
     vpbroadcastd         m5, [r6-prep_avx2+pw_8192]
@@ -753,6 +780,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
+    _CET_ENDBR
     movq                xm0, [srcq+strideq*0]
     movhps              xm0, [srcq+strideq*1]
     vpbroadcastq         m1, [srcq+strideq*2]
@@ -768,6 +796,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0]
     vinserti128          m0, [srcq+strideq*1], 1
     movu                xm1, [srcq+strideq*2]
@@ -784,6 +813,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+strideq*0]
     pmullw               m1, m4, [srcq+strideq*1]
     pmullw               m2, m4, [srcq+strideq*2]
@@ -802,6 +832,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w16
     RET
 .prep_w32:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+strideq*0+32*0]
     pmullw               m1, m4, [srcq+strideq*0+32*1]
     pmullw               m2, m4, [srcq+strideq*1+32*0]
@@ -820,6 +851,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w32
     RET
 .prep_w64:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+32*0]
     pmullw               m1, m4, [srcq+32*1]
     pmullw               m2, m4, [srcq+32*2]
@@ -838,6 +870,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w64
     RET
 .prep_w128:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+32*0]
     pmullw               m1, m4, [srcq+32*1]
     pmullw               m2, m4, [srcq+32*2]
@@ -868,6 +901,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .prep_w128
     RET
 .h:
+    _CET_ENDBR
     movd                xm5, mxyd
     mov                mxyd, r6m ; my
     vpbroadcastd         m4, [pw_16]
@@ -886,6 +920,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     movu                xm1, [srcq+strideq*0]
     vinserti128          m1, [srcq+strideq*2], 1
     movu                xm2, [srcq+strideq*1]
@@ -906,6 +941,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .h_w4
     RET
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0]
     vinserti128          m0, [srcq+strideq*1], 1
     movu                xm1, [srcq+strideq*0+2]
@@ -922,6 +958,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     pmullw               m0, m4, [srcq+strideq*0]
     pmullw               m1, m5, [srcq+strideq*0+2]
     psubw                m0, m3
@@ -942,6 +979,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 .h_w32:
 .h_w64:
 .h_w128:
+    _CET_ENDBR
     movifnidn           t0d, org_w
 .h_w32_loop0:
     mov                 r3d, t0d
@@ -966,6 +1004,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .h_w32_loop0
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
     movd                xm5, mxyd
     vpbroadcastd         m4, [pw_16]
@@ -981,6 +1020,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 .v_12bpc:
     jmp                  wq
 .v_w4:
+    _CET_ENDBR
     movq                xm0, [srcq+strideq*0]
 .v_w4_loop:
     vpbroadcastq         m2, [srcq+strideq*2]
@@ -1004,6 +1044,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0]
 .v_w8_loop:
     vbroadcasti128       m2, [srcq+strideq*1]
@@ -1022,6 +1063,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+strideq*0]
 .v_w16_loop:
     movu                 m2, [srcq+strideq*1]
@@ -1046,6 +1088,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
 %if WIN64
     PUSH                 r7
 %endif
@@ -1087,6 +1130,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 %endif
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM       7
     movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
     shl                mxyd, 11
@@ -1096,6 +1140,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
     vpbroadcastw         m6, xm6
     jmp                  wq
 .hv_w4:
+    _CET_ENDBR
     movu                xm1, [srcq+strideq*0]
 %if WIN64
     movaps         [rsp+24], xmm7
@@ -1137,6 +1182,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 %endif
     RET
 .hv_w8:
+    _CET_ENDBR
     pmullw              xm0, xm4, [srcq+strideq*0]
     pmullw              xm1, xm5, [srcq+strideq*0+2]
     psubw               xm0, xm3
@@ -1168,6 +1214,7 @@ cglobal prep_bilin_16bpc, 3, 7, 0, tmp, src, stride, w
 .hv_w32:
 .hv_w64:
 .hv_w128:
+    _CET_ENDBR
 %if WIN64
     PUSH                 r7
 %endif
@@ -1261,6 +1308,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     test                myd, 0xf00
     jnz .v
 .put:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [r8+wq*2+table_offset(put,)]
     add                  wq, r8
@@ -1269,6 +1317,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
 %endif
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movzx               mxd, mxb
     sub                srcq, 2
     mova                xm2, [subpel_h_shuf2]
@@ -1294,6 +1343,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     sub                srcq, 2
     pmovsxbw            xm3, [base+subpel_filters+mxq*8]
@@ -1324,6 +1374,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     mov                 r7d, r8m
@@ -1346,6 +1397,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     sub                  wd, 16
     jge .h_w16
 .h_w8:
+    _CET_ENDBR
 %macro PUT_6TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
     pshufb              m%1, m6        ; 01 12 23 34
     pshufb              m%2, m6        ; 45 56 67 78
@@ -1383,6 +1435,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mov                 r6d, wd
 .h_w16_loop:
     movu                 m0, [srcq+r6*2+ 0]
@@ -1398,6 +1451,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w16
     RET
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
@@ -1417,6 +1471,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w8
     je .v_w4
 .v_w2:
+    _CET_ENDBR
     movd                xm2, [srcq+r6 *2]
     pinsrd              xm2, [srcq+r6 *1], 1
     pinsrd              xm2, [srcq+ssq*0], 2
@@ -1450,6 +1505,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movq                xm1, [srcq+r6 *2]
     vpbroadcastq         m3, [srcq+r6 *1]
     vpbroadcastq         m2, [srcq+ssq*0]
@@ -1487,6 +1543,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     shl                  wd, 5
     WIN64_PUSH_XMM       12
     lea                  wd, [hq+wq-256]
@@ -1544,6 +1601,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w8_loop0
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      12, 16
     vpbroadcastd        m10, [pd_512]
     vpbroadcastw        m11, r8m
@@ -1624,6 +1682,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     WIN64_PUSH_XMM       14
     vbroadcasti128      m12, [subpel_h_shufA]
     pshufd               m5, m6, q0000
@@ -1692,6 +1751,7 @@ cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     WIN64_PUSH_XMM       16, 12
     shr                 mxd, 16
     vbroadcasti128      m12, [subpel_h_shufA]
@@ -1837,6 +1897,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     test                myd, 0xf00
     jz mangle(private_prefix %+ _put_6tap_16bpc_avx2).put
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
@@ -1857,6 +1918,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w8
     je .v_w4
 .v_w2:
+    _CET_ENDBR
     movd                xm2, [srcq+ssq*0]
     pinsrd              xm2, [srcq+ssq*1], 1
     pinsrd              xm2, [srcq+ssq*2], 2
@@ -1899,6 +1961,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movq                xm1, [srcq+ssq*0]
     vpbroadcastq         m0, [srcq+ssq*1]
     vpbroadcastq         m2, [srcq+ssq*2]
@@ -1945,6 +2008,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     shl                  wd, 5
     WIN64_PUSH_XMM       15
     lea                  wd, [hq+wq-256]
@@ -2014,6 +2078,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .v_w8_loop0
     RET
 .h:
+    _CET_ENDBR
     RESET_STACK_STATE
     test                myd, 0xf00
     jnz .hv
@@ -2039,6 +2104,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     sub                  wd, 16
     jge .h_w16
 .h_w8:
+    _CET_ENDBR
 %macro PUT_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
     pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
     pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
@@ -2081,6 +2147,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mov                 r6d, wd
 .h_w16_loop:
     movu                 m0, [srcq+r6*2+ 0]
@@ -2096,6 +2163,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .h_w16
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      16
     vpbroadcastw        m15, r8m
     cmp                  wd, 4
@@ -2192,6 +2260,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti128       m9, [subpel_h_shufA]
     vbroadcasti128      m10, [subpel_h_shufB]
     pshufd               m8, m7, q1111
@@ -2277,6 +2346,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     vpbroadcastq         m2, [base+subpel_filters+mxq*8]
     movzx               mxd, myb
@@ -2477,6 +2547,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     test                myd, 0xf00
     jnz .v
 .prep:
+    _CET_ENDBR
     tzcnt                wd, wd
     mov                 r6d, r7m ; bitdepth_max
     movzx                wd, word [r7+wq*2+table_offset(prep,)]
@@ -2490,6 +2561,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
 %endif
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     sub                srcq, 2
     pmovsxbw            xm0, [base+subpel_filters+mxq*8]
@@ -2531,6 +2603,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
@@ -2553,6 +2626,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     cmp                  wd, 8
     jg .h_w16
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0+ 0]
     vinserti128          m0, [srcq+ssq*1+ 0], 1
     movu                xm2, [srcq+ssq*0+16]
@@ -2588,6 +2662,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     add                  wd, wd
 .h_w16_loop0:
     mov                 r6d, wd
@@ -2605,6 +2680,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     jg .h_w16_loop0
     RET
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 4
@@ -2626,6 +2702,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     cmp                  wd, 4
     jg .v_w8
 .v_w4:
+    _CET_ENDBR
     movq                xm1, [srcq+r6 *2]
     vpbroadcastq         m3, [srcq+r6 *1]
     vpbroadcastq         m2, [srcq+ssq*0]
@@ -2661,6 +2738,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     WIN64_PUSH_XMM       12
 %if WIN64
     push                 r8
@@ -2724,6 +2802,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
 %endif
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      13, 15
     vpbroadcastd         m7, [prep_8tap_2d_rnd]
     vbroadcasti128       m8, [subpel_h_shufA]
@@ -2752,6 +2831,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     pshufd              m11, m1, q1111
     pshufd              m12, m1, q2222
 .hv_w4:
+    _CET_ENDBR
     movu                xm2, [srcq+r6 *2]
     vinserti128          m2, [srcq+r6 *1], 1 ; 0 1
     pshufd               m5, m6, q0000
@@ -2816,6 +2896,7 @@ cglobal prep_6tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     vpbroadcastq         m2, [base+subpel_filters+1+mxq*8]
     movzx               mxd, myb
@@ -2960,6 +3041,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     test                myd, 0xf00
     jz mangle(private_prefix %+ _prep_6tap_16bpc_avx2).prep
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 4
@@ -2982,6 +3064,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     cmp                  wd, 4
     jg .v_w8
 .v_w4:
+    _CET_ENDBR
     movq                xm1, [srcq+strideq*0]
     vpbroadcastq         m0, [srcq+strideq*1]
     vpbroadcastq         m2, [srcq+strideq*2]
@@ -3026,6 +3109,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
 %if WIN64
     WIN64_PUSH_XMM       15
     push                 r8
@@ -3101,6 +3185,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
 %endif
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [prep_8tap_1d_rnd] ; 8 - (8192 << 4)
@@ -3125,6 +3210,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     cmp                  wd, 8
     jg .h_w16
 .h_w8:
+    _CET_ENDBR
 %macro PREP_8TAP_H 5 ; dst/src+0, src+8, src+16, tmp[1-2]
     pshufb              m%4, m%1, m7   ; 2 3 3 4 4 5 5 6
     pshufb              m%1, m6        ; 0 1 1 2 2 3 3 4
@@ -3165,6 +3251,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     add                  wd, wd
 .h_w16_loop0:
     mov                 r6d, wd
@@ -3182,6 +3269,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     jg .h_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      16
     vpbroadcastd        m15, [prep_8tap_2d_rnd]
     cmp                  wd, 4
@@ -3210,6 +3298,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     pshufd              m13, m1, q2222
     pshufd              m14, m1, q3333
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti128       m9, [subpel_h_shufA]
     vbroadcasti128      m10, [subpel_h_shufB]
     pshufd               m8, m7, q1111
@@ -3293,6 +3382,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     vpbroadcastq         m2, [base+subpel_filters+mxq*8]
     movzx               mxd, myb
@@ -3646,6 +3736,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     jmp                  wq
 %if isput
 .w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     sub                srcq, 2
@@ -3766,6 +3857,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     jmp .w2_loop
 %endif
 .w4:
+    _CET_ENDBR
     mov                 myd, mym
     mova         [rsp+0x00], m12
 %if isput
@@ -3969,22 +4061,27 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     SWAP                m13, m11
 %endif
 .w8:
+    _CET_ENDBR
     mov    dword [rsp+0x80], 1
     movifprep   tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov    dword [rsp+0x80], 2
     movifprep   tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov    dword [rsp+0x80], 4
     movifprep   tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov    dword [rsp+0x80], 8
     movifprep   tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov    dword [rsp+0x80], 16
     movifprep   tmp_stridem, 256
 .w_start:
@@ -4188,11 +4285,13 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     SWAP                 m1, m12, m10
     SWAP                 m7, m11
 .dy1:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %if isput
 .dy1_w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     sub                srcq, 2
@@ -4291,6 +4390,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     RET
 %endif
 .dy1_w4:
+    _CET_ENDBR
     mov                 myd, mym
 %if isput
     mova         [rsp+0x50], xm11
@@ -4455,22 +4555,27 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     MC_8TAP_SCALED_RET
     SWAP                 m10, m13
 .dy1_w8:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 1
     movifprep   tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 2
     movifprep   tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 4
     movifprep   tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 8
     movifprep   tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 16
     movifprep   tmp_stridem, 256
 .dy1_w_start:
@@ -4652,11 +4757,13 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     SWAP                 m1, m12, m10
     SWAP                 m7, m11
 .dy2:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %if isput
 .dy2_w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     sub                srcq, 2
@@ -4755,6 +4862,7 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     RET
 %endif
 .dy2_w4:
+    _CET_ENDBR
     mov                 myd, mym
 %if isput
     mova         [rsp+0x50], xm11
@@ -4918,22 +5026,27 @@ cglobal prep_8tap_scaled_16bpc, 4, 14, 16, 0xe0, tmp, 
     MC_8TAP_SCALED_RET
     SWAP                m10, m13
 .dy2_w8:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 1
     movifprep   tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 2
     movifprep   tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 4
     movifprep   tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 8
     movifprep   tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov    dword [rsp+0xa0], 16
     movifprep   tmp_stridem, 256
 .dy2_w_start:
@@ -5325,6 +5438,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
     lea               tmp1d, [mxq+alphaq*4]
     lea               tmp2d, [mxq+alphaq*1]
     movu               xm10, [srcq-6]
@@ -5378,6 +5492,7 @@ ALIGN function_align
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movq   [dstq          ], xm0
     movhps [dstq+strideq*1], xm0
     vextracti128        xm0, m0, 1
@@ -5408,6 +5523,7 @@ ALIGN function_align
 .ret:
     RET
 .w8:
+    _CET_ENDBR
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
     mova         [dstq+strideq*2], xm1
@@ -5435,6 +5551,7 @@ ALIGN function_align
     call .main
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     mova   [dstq+strideq*0], m0
     mova   [dstq+strideq*1], m1
     mova   [dstq+strideq*2], m2
@@ -5446,6 +5563,7 @@ ALIGN function_align
     call .main
     lea                dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     mova [dstq+strideq*0+32*0], m0
     mova [dstq+strideq*0+32*1], m1
     mova [dstq+strideq*1+32*0], m2
@@ -5457,6 +5575,7 @@ ALIGN function_align
     call .main
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     mova        [dstq+32*2], m2
@@ -5468,6 +5587,7 @@ ALIGN function_align
     call .main
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     mova        [dstq+32*2], m2
@@ -5665,6 +5785,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     phaddd               m4, m5
     paddw                m4, m14
     psrlw                m4, 2
@@ -5705,6 +5826,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea                dstq, [dstq+strideq*4]
     add               maskq, 16
 .w8:
+    _CET_ENDBR
     vperm2i128           m6, m4, m5, 0x21
     vpblendd             m4, m5, 0xf0
     paddw                m4, m14
@@ -5732,6 +5854,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea                dstq, [dstq+strideq*4]
     add               maskq, 16
 .w16:
+    _CET_ENDBR
     punpcklqdq           m6, m4, m5
     punpckhqdq           m4, m5
     paddw                m6, m14
@@ -5753,6 +5876,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea                dstq, [dstq+strideq*4]
     add               maskq, 32
 .w32:
+    _CET_ENDBR
     paddw                m4, m14
     paddw                m4, m5
     psrlw               m15, m4, 2
@@ -5780,6 +5904,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea                dstq, [dstq+strideq*2]
     add               maskq, 32
 .w64:
+    _CET_ENDBR
     paddw                m4, m14
     paddw               m15, m14, m5
     mova [dstq+strideq*0+32*0], m0
@@ -5808,6 +5933,7 @@ cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea                dstq, [dstq+strideq*2]
     add               maskq, 64
 .w128:
+    _CET_ENDBR
     paddw                m4, m14
     paddw                m5, m14
     mova [dstq+strideq*0+32*0], m0
@@ -5906,6 +6032,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movq   [dstq+strideq*0], xm0
     movhps [dstq+strideq*1], xm0
     vextracti128        xm0, m0, 1
@@ -5938,6 +6065,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     call .main
     lea                dstq, [dstq+strideq*4]
 .w8:
+    _CET_ENDBR
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
     mova         [dstq+strideq*2], xm1
@@ -5956,6 +6084,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     call .main
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     mova   [dstq+strideq*0], m0
     mova   [dstq+strideq*1], m1
     mova   [dstq+strideq*2], m2
@@ -5967,6 +6096,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     call .main
     lea                dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     mova [dstq+strideq*0+32*0], m0
     mova [dstq+strideq*0+32*1], m1
     mova [dstq+strideq*1+32*0], m2
@@ -5978,6 +6108,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     call .main
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     mova        [dstq+32*2], m2
@@ -5989,6 +6120,7 @@ cglobal w_mask_422_16bpc, 4, 8, 16, dst, stride, tmp1,
     call .main
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     mova        [dstq+32*2], m2
@@ -6038,6 +6170,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movq   [dstq+strideq*0], xm0
     movhps [dstq+strideq*1], xm0
     vextracti128        xm0, m0, 1
@@ -6071,6 +6204,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     call .main
     lea                dstq, [dstq+strideq*4]
 .w8:
+    _CET_ENDBR
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
     mova         [dstq+strideq*2], xm1
@@ -6083,6 +6217,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     call .main
     lea                dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     mova   [dstq+strideq*0], m0
     mova   [dstq+strideq*1], m1
     sub                  hd, 2
@@ -6092,6 +6227,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     call .main
     add                dstq, strideq
 .w32:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     dec                  hd
@@ -6101,6 +6237,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     call .main
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     call .main
@@ -6113,6 +6250,7 @@ cglobal w_mask_444_16bpc, 4, 8, 11, dst, stride, tmp1,
     call .main
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     mova        [dstq+32*0], m0
     mova        [dstq+32*1], m1
     call .main
@@ -6157,6 +6295,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea                  r6, [dsq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     pmovzxbw             m3, [maskq]
     movq                xm0, [dstq+dsq*0]
     movhps              xm0, [dstq+dsq*1]
@@ -6180,6 +6319,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w4
     RET
 .w8:
+    _CET_ENDBR
     pmovzxbw             m4, [maskq+16*0]
     pmovzxbw             m5, [maskq+16*1]
     mova                xm0, [dstq+dsq*0]
@@ -6205,6 +6345,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w8
     RET
 .w16:
+    _CET_ENDBR
     pmovzxbw             m4, [maskq+16*0]
     pmovzxbw             m5, [maskq+16*1]
     mova                 m0,     [dstq+dsq*0]
@@ -6226,6 +6367,7 @@ cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w16
     RET
 .w32:
+    _CET_ENDBR
     pmovzxbw             m4, [maskq+16*0]
     pmovzxbw             m5, [maskq+16*1]
     mova                 m0,     [dstq+32*0]
@@ -6257,6 +6399,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
     add                  wq, r5
     jmp                  wq
 .w2:
+    _CET_ENDBR
     vpbroadcastd         m2, [base+obmc_masks_avx2+2*2]
 .w2_loop:
     movd                 m0, [dstq+dsq*0]
@@ -6273,6 +6416,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
     jg .w2_loop
     RET
 .w4:
+    _CET_ENDBR
     vpbroadcastq         m2, [base+obmc_masks_avx2+4*2]
 .w4_loop:
     movq                 m0, [dstq+dsq*0]
@@ -6289,6 +6433,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, ds, tmp, w, h
     RET
 INIT_YMM avx2
 .w8:
+    _CET_ENDBR
     vbroadcasti128       m2, [base+obmc_masks_avx2+8*2]
 .w8_loop:
     mova                xm0, [dstq+dsq*0]
@@ -6304,6 +6449,7 @@ INIT_YMM avx2
     jg .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     mova                 m4, [base+obmc_masks_avx2+16*2]
 .w16_loop:
     mova                 m0,     [dstq+dsq*0]
@@ -6322,6 +6468,7 @@ INIT_YMM avx2
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
 %if WIN64
     movaps         [rsp+ 8], xmm6
     movaps         [rsp+24], xmm7
@@ -6389,6 +6536,7 @@ cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, ma
     neg                  hq
     jmp                  wq
 .w2:
+    _CET_ENDBR
     movd                 m0, [dstq+dsq*0]
     pinsrd               m0, [dstq+dsq*1], 1
     movd                 m2, [maskq+hq*2]
@@ -6405,6 +6553,7 @@ cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, ma
     jl .w2
     RET
 .w4:
+    _CET_ENDBR
     mova                 m3, [blend_shuf]
 .w4_loop:
     movq                 m0, [dstq+dsq*0]
@@ -6423,6 +6572,7 @@ cglobal blend_h_16bpc, 3, 6, 6, dst, ds, tmp, w, h, ma
     RET
 INIT_YMM avx2
 .w8:
+    _CET_ENDBR
     vbroadcasti128       m3, [blend_shuf]
     shufpd               m3, m3, 0x0c
 .w8_loop:
@@ -6441,6 +6591,7 @@ INIT_YMM avx2
     jl .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     vpbroadcastw         m4, [maskq+hq*2]
     vpbroadcastw         m5, [maskq+hq*2+2]
     mova                 m0,     [dstq+dsq*0]
@@ -6459,6 +6610,7 @@ INIT_YMM avx2
     jl .w16
     RET
 .w32:
+    _CET_ENDBR
     vpbroadcastw         m4, [maskq+hq*2]
     BLEND_H_ROW           0, 0, 2
     add                dstq, dsq
@@ -6466,6 +6618,7 @@ INIT_YMM avx2
     jl .w32
     RET
 .w64:
+    _CET_ENDBR
     vpbroadcastw         m4, [maskq+hq*2]
     BLEND_H_ROW           0, 0
     BLEND_H_ROW           2, 2, 4
@@ -6474,6 +6627,7 @@ INIT_YMM avx2
     jl .w64
     RET
 .w128:
+    _CET_ENDBR
     vpbroadcastw         m4, [maskq+hq*2]
     BLEND_H_ROW           0,  0
     BLEND_H_ROW           2,  2, 8
