Index: src/x86/mc_avx512.asm
--- src/x86/mc_avx512.asm.orig
+++ src/x86/mc_avx512.asm
@@ -301,10 +301,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     test               mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put,)]
     add                  wq, r7
     jmp                  wq
 .put_w2:
+    _CET_ENDBR
     movzx               r6d, word [srcq+ssq*0]
     movzx               r7d, word [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -315,6 +317,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov                 r6d, [srcq+ssq*0]
     mov                 r7d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -325,6 +328,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     mov                  r6, [srcq+ssq*0]
     mov                  r7, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -335,6 +339,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu               xmm0, [srcq+ssq*0]
     movu               xmm1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -345,6 +350,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w16
     RET
 .put_w32:
+    _CET_ENDBR
     movu                ym0, [srcq+ssq*0]
     movu                ym1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -355,6 +361,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -365,6 +372,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+64*0]
     movu                 m1, [srcq+ssq*0+64*1]
     movu                 m2, [srcq+ssq*1+64*0]
@@ -379,6 +387,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul               mxyd, 255
@@ -393,6 +402,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     add                  wq, r7
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movd               xmm0, [srcq+ssq*0]
     pinsrd             xmm0, [srcq+ssq*1], 1
     lea                srcq, [srcq+ssq*2]
@@ -407,6 +417,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w2
     RET
 .h_w4:
+    _CET_ENDBR
     mova               xmm4, [bilin_h_shuf4]
 .h_w4_loop:
     movq               xmm0, [srcq+ssq*0]
@@ -423,6 +434,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
     vinserti32x4        ym0, [srcq+ssq*1], 1
     lea                srcq, [srcq+ssq*2]
@@ -437,6 +449,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm16]
 .h_w16_loop:
     movu                ym0, [srcq+ssq*0]
@@ -453,6 +466,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
     movu                ym0, [srcq+ssq*0+8*0]
     vinserti32x8         m0, [srcq+ssq*1+8*0], 1
     movu                ym1, [srcq+ssq*0+8*1]
@@ -472,6 +486,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     pshufb               m0, m4
@@ -488,6 +503,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m2, [srcq+8*1]
     movu                 m1, [srcq+8*8]
@@ -505,6 +521,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
     imul               mxyd, 255
     vpbroadcastd         m5, [pw_2048]
@@ -513,6 +530,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     vpbroadcastw         m4, mxyd
     jmp                  wq
 .v_w2:
+    _CET_ENDBR
     movd               xmm0,       [srcq+ssq*0]
 .v_w2_loop:
     pinsrw             xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
@@ -530,6 +548,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd               xmm0, [srcq+ssq*0]
 .v_w4_loop:
     vpbroadcastd       xmm2, [srcq+ssq*1]
@@ -548,6 +567,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq               xmm0, [srcq+ssq*0]
 .v_w8_loop:
     movq               xmm2, [srcq+ssq*1]
@@ -567,6 +587,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu               xmm0, [srcq+ssq*0]
 .v_w16_loop:
     vbroadcasti128     ymm3, [srcq+ssq*1]
@@ -589,6 +610,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     vzeroupper
     RET
 .v_w32:
+    _CET_ENDBR
     movu                ym0, [srcq+ssq*0]
     kxnorb               k1, k1, k1
 .v_w32_loop:
@@ -611,6 +633,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w32_loop
     RET
 .v_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
 .v_w64_loop:
     movu                 m3, [srcq+ssq*1]
@@ -634,6 +657,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+64*0]
     movu                 m1, [srcq+64*1]
 .v_w128_loop:
@@ -660,6 +684,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .v_w128_loop
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
@@ -670,6 +695,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     vpbroadcastw         m6, mxyd
     jmp                  wq
 .hv_w2:
+    _CET_ENDBR
     vpbroadcastd       xmm0, [srcq+ssq*0]
     pshufb             xmm0, xm4
     pmaddubsw          xmm0, xm5
@@ -694,6 +720,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova               xmm4, [bilin_h_shuf4]
     movddup            xmm0, [srcq+ssq*0]
     pshufb             xmm0, xmm4
@@ -719,6 +746,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128      ym0, [srcq+ssq*0]
     pshufb              ym0, ym4
     pmaddubsw           ym0, ym5
@@ -743,6 +771,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     vbroadcasti32x8      m0, [srcq+ssq*0]
     mova                 m4, [bilin_h_perm16]
     vpermb               m0, m4, m0
@@ -768,6 +797,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w16_loop
     RET
 .hv_w32:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
     vpermb               m0, m4, [srcq+ssq*0]
     pmovzxbq             m8, [pb_02461357]
@@ -797,6 +827,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w32_loop
     RET
 .hv_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     pshufb               m0, m4
@@ -830,6 +861,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w64_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     movu                 m2, [srcq+8*8]
@@ -890,11 +922,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     test               mxyd, mxyd
     jnz .v
 .prep:
+    _CET_ENDBR
     movzx                wd, word [t2+wq*2+table_offset(prep,)]
     add                  wq, t2
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
+    _CET_ENDBR
     movd               xmm0, [srcq+strideq*0]
     pinsrd             xmm0, [srcq+strideq*1], 1
     pinsrd             xmm0, [srcq+strideq*2], 2
@@ -908,6 +942,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq               xmm0, [srcq+strideq*0]
     movq               xmm1, [srcq+strideq*1]
     vinserti128         ym0, ymm0, [srcq+strideq*2], 1
@@ -922,6 +957,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     movu               xmm0, [srcq+strideq*0]
     vinserti128         ym0, ymm0, [srcq+strideq*1], 1
     movu               xmm1, [srcq+strideq*2]
@@ -938,6 +974,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w32:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+strideq*0]
     pmovzxbw             m1, [srcq+strideq*1]
     pmovzxbw             m2, [srcq+strideq*2]
@@ -953,6 +990,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w32
     RET
 .prep_w64:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+strideq*0+32*0]
     pmovzxbw             m1, [srcq+strideq*0+32*1]
     pmovzxbw             m2, [srcq+strideq*1+32*0]
@@ -968,6 +1006,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w64
     RET
 .prep_w128:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+32*0]
     pmovzxbw             m1, [srcq+32*1]
     pmovzxbw             m2, [srcq+32*2]
@@ -983,6 +1022,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w128
     RET
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul               mxyd, 255
@@ -996,6 +1036,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     vbroadcasti32x4     ym4, [bilin_h_shuf4]
 .h_w4_loop:
     movq               xmm0, [srcq+strideq*0]
@@ -1012,6 +1053,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     vbroadcasti32x4      m4, [bilin_h_perm16]
 .h_w8_loop:
     movu               xmm0, [srcq+strideq*0]
@@ -1027,6 +1069,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm16]
 .h_w16_loop:
     movu                ym0, [srcq+strideq*0]
@@ -1045,6 +1088,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
 .h_w32_loop:
     vpermb               m0, m4, [srcq+strideq*0]
@@ -1065,6 +1109,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_loop
     RET
 .h_w64:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
 .h_w64_loop:
     vpermb               m0, m4, [srcq+strideq*0+32*0]
@@ -1085,6 +1130,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w64_loop
     RET
 .h_w128:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
 .h_w128_loop:
     vpermb               m0, m4, [srcq+32*0]
@@ -1105,6 +1151,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w128_loop
     RET
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM       7
     movzx                wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
     imul               mxyd, 255
@@ -1114,6 +1161,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     vpbroadcastw         m6, mxyd
     jmp                  wq
 .v_w4:
+    _CET_ENDBR
     vpbroadcastd        xm0, [srcq+strideq*0]
     mov                 r3d, 0x29
     vbroadcasti32x4     ym3, [bilin_v_shuf4]
@@ -1133,6 +1181,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     mova                 m5, [bilin_v_perm8]
     vbroadcasti32x4     ym0, [srcq+strideq*0]
 .v_w8_loop:
@@ -1149,6 +1198,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     mova                 m5, [bilin_v_perm16]
     movu                xm0, [srcq+strideq*0]
 .v_w16_loop:
@@ -1168,6 +1218,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     mova                 m5, [bilin_v_perm32]
     movu                ym0, [srcq+strideq*0]
 .v_w32_loop:
@@ -1193,6 +1244,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w32_loop
     RET
 .v_w64:
+    _CET_ENDBR
     mova                 m5, [bilin_v_perm64]
     vpermq               m0, m5, [srcq+strideq*0]
 .v_w64_loop:
@@ -1216,6 +1268,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     mova                 m5, [bilin_v_perm64]
     vpermq               m0, m5, [srcq+strideq*0+ 0]
     vpermq               m1, m5, [srcq+strideq*0+64]
@@ -1254,6 +1307,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w128_loop
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
     WIN64_SPILL_XMM       7
@@ -1264,6 +1318,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti32x4     ym4, [bilin_h_shuf4]
     vpbroadcastq        ym0, [srcq+strideq*0]
     pshufb              ym0, ym4
@@ -1288,6 +1343,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti32x4      m4, [bilin_h_perm16]
     vbroadcasti32x4      m0, [srcq+strideq*0]
     pshufb               m0, m4
@@ -1311,6 +1367,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm16]
     vbroadcasti32x8      m0, [srcq+strideq*0]
     vpermb               m0, m4, m0
@@ -1340,6 +1397,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w16_loop
     RET
 .hv_w32:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
     vpermb               m0, m4, [srcq+strideq*0]
     pmaddubsw            m0, m5
@@ -1362,6 +1420,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w32_loop
     RET
 .hv_w64:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
     vpermb               m0, m4, [srcq+32*0]
     vpermb               m1, m4, [srcq+32*1]
@@ -1388,6 +1447,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w64_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     mova                 m4, [bilin_h_perm32]
     vpermb               m0, m4, [srcq+32*0]
     vpermb               m1, m4, [srcq+32*1]
@@ -1493,6 +1553,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     test                myd, 0xf00
     jnz .v
 .put:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [r8+wq*2+table_offset(put,)]
     add                  wq, r8
@@ -1503,6 +1564,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 %endif
     jmp                  wq
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
@@ -1519,6 +1581,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     neg                 nsq
     jmp                  r6
 .v_w2:
+    _CET_ENDBR
     movd               xmm2, [srcq+nsq*2]
     pinsrw             xmm2, [srcq+nsq*1], 2
     pinsrw             xmm2, [srcq+ssq*0], 4
@@ -1550,6 +1613,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd               xmm2, [srcq+nsq*2]
     pinsrd             xmm2, [srcq+nsq*1], 1
     pinsrd             xmm2, [srcq+ssq*0], 2
@@ -1581,6 +1645,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq               xmm1, [srcq+nsq*2]
     vpbroadcastq       ymm3, [srcq+nsq*1]
     vpbroadcastq       ymm2, [srcq+ssq*0]
@@ -1617,6 +1682,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vzeroupper
     RET
 .v_w16:
+    _CET_ENDBR
     mova                 m5, [spel_v_perm16a]
     vbroadcasti32x4      m1, [srcq+nsq*2]
     vbroadcasti32x4     ym3, [srcq+nsq*1]
@@ -1653,6 +1719,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     mova                m10, [spel_v_perm32]
     pmovzxbq             m5, [pb_02461357]
     vpshrdw             m11, m10, m10, 8
@@ -1696,6 +1763,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     RET
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+wq*4-256]
 .v_loop0:
     movu                 m2, [srcq+nsq*2]
@@ -1763,9 +1831,11 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_loop0
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jz mangle(private_prefix %+ _put_8tap_8bpc_avx512icl).h2
 .hv:
+    _CET_ENDBR
     vpbroadcastd         m9, [pd_34]
     mova               xm10, [spel_hv_end]
     pxor                xm0, xm0
@@ -1834,6 +1904,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     movq                xm2, [srcq+nsq*2]
     vpbroadcastq        ym1, [srcq+nsq*1]
     vinserti32x4        ym2, [srcq+ssq*0], 1
@@ -1878,6 +1949,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     sub                srcq, 3
     vpbroadcastd        m11, [base+subpel_filters+mxq*8+0]
@@ -1957,6 +2029,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu                m19, [spel_hv_perm16a]
     vpbroadcastd         m7, [pb_4]
     lea                 r6d, [wq*2-32]
@@ -2058,6 +2131,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     test                myd, 0xf00
     jz mangle(private_prefix %+ _put_6tap_8bpc_avx512icl).put
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 6
@@ -2075,6 +2149,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vpbroadcastw        m11, [myq+6]
     jmp                  r6
 .v_w2:
+    _CET_ENDBR
     movd               xmm2, [srcq+ssq*0]
     pinsrw             xmm2, [srcq+ssq*1], 2
     pinsrw             xmm2, [srcq+ssq*2], 4
@@ -2115,6 +2190,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd               xmm2, [srcq+ssq*0]
     pinsrd             xmm2, [srcq+ssq*1], 1
     pinsrd             xmm2, [srcq+ssq*2], 2
@@ -2155,6 +2231,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq               xmm1, [srcq+ssq*0]
     vpbroadcastq       ymm0, [srcq+ssq*1]
     vpbroadcastq       ymm2, [srcq+ssq*2]
@@ -2200,6 +2277,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vzeroupper
     RET
 .v_w16:
+    _CET_ENDBR
     mova                m12, [spel_v_perm16a]
     vbroadcasti32x4      m1, [srcq+ssq*0]
     vbroadcasti32x4     ym4, [srcq+ssq*1]
@@ -2244,6 +2322,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     mova                m12, [spel_v_perm32]
     pmovzxbq            m14, [pb_02461357]
     vpshrdw             m13, m12, m12, 8
@@ -2299,6 +2378,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     RET
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+wq*4-256]
     mov                  r4, srcq
     mov                  r7, dstq
@@ -2389,6 +2469,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vzeroupper
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
 .h2:
@@ -2408,6 +2489,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     add                  wq, r8
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movzx               mxd, mxb
     dec                srcq
     mova               xmm4, [subpel_h_shuf4]
@@ -2429,6 +2511,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     dec                srcq
     vpbroadcastd       xmm3, [base+mxq*8+subpel_filters+2]
@@ -2452,6 +2535,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
     vinserti32x4        ym0, [srcq+ssq*1], 1
     lea                srcq, [srcq+ssq*2]
@@ -2464,6 +2548,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mova                 m6, [spel_h_perm16]
     vpbroadcastd         m8, [pb_4]
     paddb                m7, m8, m6
@@ -2481,6 +2566,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
     movu                ym0, [srcq+ssq*0+8*0]
     vinserti32x8         m0, [srcq+ssq*1+8*0], 1
     movu                ym1, [srcq+ssq*0+8*1]
@@ -2496,6 +2582,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     add                srcq, ssq
@@ -2508,6 +2595,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m2, [srcq+8*1]
     movu                 m1, [srcq+8*8]
@@ -2526,6 +2614,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w128
     RET
 .hv:
+    _CET_ENDBR
     vpbroadcastd         m9, [pd_34]
     pxor                xm0, xm0
     cmp                  wd, 4
@@ -2604,6 +2693,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vzeroupper
     RET
 .hv_w4:
+    _CET_ENDBR
     movq               xmm1, [r6+ssq*0]
     vpbroadcastq        ym2, [r6+ssq*1]
     vinserti32x4        ym1, ymm1, [r6+ssq*2], 1
@@ -2654,6 +2744,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     sub                srcq, 3
     vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
@@ -2746,6 +2837,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vzeroupper
     RET
 .hv_w16:
+    _CET_ENDBR
     WIN64_SPILL_XMM      23
     movu                m22, [spel_hv_perm16a]
     sub                srcq, ss3q
@@ -2879,6 +2971,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     test                myd, 0xf00
     jnz .v
 .prep:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [r7+wq*2+table_offset(prep,)]
     add                  wq, r7
@@ -2888,6 +2981,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
 %endif
     jmp                  wq
 .v:
+    _CET_ENDBR
     movzx               mxd, myb
     shr                 myd, 16
     cmp                  hd, 4
@@ -2905,6 +2999,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     sub                srcq, ssq
     jmp                  r5
 .v_w4:
+    _CET_ENDBR
     movd               xmm2, [srcq+ssq*0]
     pinsrd             xmm2, [srcq+ssq*1], 1
     vpbroadcastd       ymm1, [srcq+ssq*2]
@@ -2941,6 +3036,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     vzeroupper
     RET
 .v_w8:
+    _CET_ENDBR
     mova                 m6, [spel_v_perm8]
     movq                xm1, [srcq+ssq*0]
     mov                 r6d, 0x3e
@@ -2976,6 +3072,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     mova                m11, [spel_v_perm16b]
     vbroadcasti32x4      m1, [srcq+ssq*0]
     mov                 r6d, 0x0f
@@ -3018,6 +3115,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     movshdup             m6, [bilin_v_perm64]
     movu               ym16, [srcq+ssq*0]
     movu               ym17, [srcq+ssq*1]
@@ -3065,6 +3163,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     RET
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     mova                 m6, [bilin_v_perm64]
     add                  wd, wd
     lea                 r6d, [hq+wq]
@@ -3133,9 +3232,11 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     vzeroupper
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jz mangle(private_prefix %+ _prep_8tap_8bpc_avx512icl).h2
 .hv:
+    _CET_ENDBR
     vpbroadcastd         m8, [pd_2]
     vpbroadcastd         m9, [pd_32]
     cmp                  wd, 4
@@ -3205,6 +3306,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
     vpbroadcastd        m11, [base+subpel_filters+mxq*8+4]
@@ -3300,6 +3402,7 @@ cglobal prep_6tap_8bpc, 4, 8, 0, tmp, src, ss, w, h, m
     vzeroupper
     RET
 .hv_w16:
+    _CET_ENDBR
     mova                m16, [spel_h_perm16]
     vpbroadcastd        m18, [pb_4]
     add                  wd, wd
@@ -3440,6 +3543,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     test                myd, 0xf00
     jz mangle(private_prefix %+ _prep_6tap_8bpc_avx512icl).prep
 .v:
+    _CET_ENDBR
     movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
     shr                 myd, 16  ; Note that the code is 8-tap only, having
     cmp                  hd, 4   ; a separate 4-tap code path for (4|8|16)x4
@@ -3457,6 +3561,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     vpbroadcastw        m11, [myq+6]
     jmp                  r5
 .v_w4:
+    _CET_ENDBR
     movd               xmm0, [srcq+strideq*0]
     vpbroadcastd       ymm1, [srcq+strideq*2]
     vpbroadcastd       xmm2, [srcq+strideq*1]
@@ -3502,6 +3607,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     vzeroupper
     RET
 .v_w8:
+    _CET_ENDBR
     mova                 m6, [spel_v_perm8]
     movq                xm1, [srcq+strideq*0]
     mov                 r6d, 0x3e
@@ -3542,6 +3648,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     mova                m12, [spel_v_perm16b]
     vbroadcasti32x4      m1, [srcq+strideq*0]
     mov                 r6d, 0x0f
@@ -3593,6 +3700,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     movshdup            m21, [bilin_v_perm64]
     movu               ym16, [srcq+strideq*0]
     movu               ym17, [srcq+strideq*1]
@@ -3651,6 +3759,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     RET
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     WIN64_SPILL_XMM      24
     mova                m23, [bilin_v_perm64]
     add                  wd, wd
@@ -3738,6 +3847,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .v_loop0
     RET
 .h:
+    _CET_ENDBR
     RESET_STACK_STATE
     test                myd, 0xf00
     jnz .hv
@@ -3754,6 +3864,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     add                  wq, r7
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     vbroadcasti128      ym5, [subpel_h_shufA]
     mov                 r3d, 0x4
@@ -3781,6 +3892,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     vbroadcasti128       m5, [subpel_h_shufA]
     vbroadcasti128       m6, [subpel_h_shufB]
     vbroadcasti128       m7, [subpel_h_shufC]
@@ -3808,6 +3920,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
     mova                 m5, [spel_h_perm16]
     vpbroadcastd         m7, [pb_4]
     lea            stride3q, [strideq*3]
@@ -3825,6 +3938,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
     mova                 m5, [spel_h_perm32]
     vpbroadcastd         m7, [pb_4]
     paddb                m6, m7, m5
@@ -3839,9 +3953,11 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .h_w32_loop
     RET
 .h_w64:
+    _CET_ENDBR
     xor                 r6d, r6d
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -64*1
 .h_start:
     mova                 m5, [spel_h_perm32]
@@ -3863,6 +3979,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .h_loop0
     RET
 .hv:
+    _CET_ENDBR
     RESET_STACK_STATE
     vpbroadcastd         m8, [pd_2]
     vpbroadcastd         m9, [pd_32]
@@ -3938,6 +4055,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     sub                srcq, 3
     vpbroadcastd        m10, [base+subpel_filters+mxq*8+0]
@@ -4041,6 +4159,7 @@ cglobal prep_8tap_8bpc, 4, 8, 0, tmp, src, stride, w, 
     vzeroupper
     RET
 .hv_w16:
+    _CET_ENDBR
     WIN64_SPILL_XMM      23
     mova                m16, [spel_h_perm16]
     vpbroadcastd        m18, [pb_4]
@@ -4266,6 +4385,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
     movu                xm5, [srcq+ssq*1]
     psrad              ym16, ym18, 10
     lea                srcq, [srcq+ssq*2]
@@ -4291,6 +4411,7 @@ ALIGN function_align
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     cmp                  hd, 8
     jg .w4_h16
     WRAP_YMM %1           0
@@ -4315,6 +4436,7 @@ ALIGN function_align
     vpscatterdd [dstq+m7]{k1}, m0
     RET
 .w8:
+    _CET_ENDBR
     cmp                  hd, 4
     jne .w8_h8
     WRAP_YMM %1           0
@@ -4348,6 +4470,7 @@ ALIGN function_align
     %1_INC_PTR            2
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     %1                    0
     vpermq               m0, m0, q3120
     mova          [dstq          ], xm0
@@ -4358,6 +4481,7 @@ ALIGN function_align
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     pmovzxbq             m7, [pb_02461357]
 .w32_loop:
     %1                    0
@@ -4370,6 +4494,7 @@ ALIGN function_align
     jg .w32_loop
     RET
 .w64:
+    _CET_ENDBR
     pmovzxbq             m7, [pb_02461357]
 .w64_loop:
     %1                    0
@@ -4381,6 +4506,7 @@ ALIGN function_align
     jg .w64_loop
     RET
 .w128:
+    _CET_ENDBR
     pmovzxbq             m7, [pb_02461357]
 .w128_loop:
     %1                    0
@@ -4552,6 +4678,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     mova                 m5, [wm_420_perm4]
     cmp                  hd, 8
     jg .w4_h16
@@ -4586,6 +4713,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     vpscatterdd [dstq+m11]{k1}, m0
     RET
 .w8:
+    _CET_ENDBR
     mova                 m5, [wm_420_perm8]
     cmp                  hd, 4
     jne .w8_h8
@@ -4629,6 +4757,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     jg .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     mova                 m5, [wm_420_perm16]
 .w16_loop:
     W_MASK                0, 4, 0, 1
@@ -4650,6 +4779,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     pmovzxbq             m5, [pb_02461357]
 .w32_loop:
     W_MASK                0, 4, 0, 1
@@ -4668,6 +4798,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     jg .w32_loop
     RET
 .w64:
+    _CET_ENDBR
     pmovzxbq            m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
     psrlq               m13, m12, 4          ; 1, 3, 5, 7, 9, 11, 13, 15
 .w64_loop:
@@ -4692,6 +4823,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1, 
     jg .w64_loop
     RET
 .w128:
+    _CET_ENDBR
     pmovzxbq            m14, [wm_420_perm64]
     mova                m10, [wm_420_mask]
     psrlq               m15, m14, 4
@@ -4746,6 +4878,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     cmp                  hd, 8
     jg .w4_h16
     WRAP_YMM W_MASK       0, 4, 0, 1
@@ -4779,6 +4912,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     vpscatterdd [dstq+m5]{k1}, m0
     RET
 .w8:
+    _CET_ENDBR
     cmp                  hd, 4
     jne .w8_h8
     WRAP_YMM W_MASK       0, 4, 0, 1
@@ -4826,6 +4960,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     add               maskq, 32
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     W_MASK                0, 4, 0, 1
     mova                 m1, m8
     vpdpwssd             m1, m4, m9
@@ -4841,6 +4976,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     pmovzxbq             m5, [pb_02461357]
 .w32_loop:
     W_MASK                0, 4, 0, 1
@@ -4860,6 +4996,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     jg .w32_loop
     RET
 .w64:
+    _CET_ENDBR
     pmovzxbq             m5, [pb_02461357]
 .w64_loop:
     W_MASK                0, 4, 0, 1
@@ -4878,6 +5015,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1, 
     jg .w64_loop
     RET
 .w128:
+    _CET_ENDBR
     pmovzxbq            m13, [pb_02461357]
 .w128_loop:
     W_MASK                0, 4, 0, 1
@@ -4916,6 +5054,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     cmp                  hd, 8
     jg .w4_h16
     WRAP_YMM W_MASK       0, 4, 0, 1, 1
@@ -4945,6 +5084,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     vpscatterdd [dstq+m9]{k1}, m0
     RET
 .w8:
+    _CET_ENDBR
     cmp                  hd, 4
     jne .w8_h8
     WRAP_YMM W_MASK       0, 4, 0, 1, 1
@@ -4987,6 +5127,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     add               maskq, 64
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     W_MASK                0, 4, 0, 1, 1
     vpermb               m4, m8, m4
     vpermq               m0, m0, q3120
@@ -4999,6 +5140,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     pmovzxbq             m9, [pb_02461357]
 .w32_loop:
     W_MASK                0, 4, 0, 1, 1
@@ -5015,6 +5157,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     jg .w32_loop
     RET
 .w64:
+    _CET_ENDBR
     pmovzxbq             m9, [pb_02461357]
 .w64_loop:
     W_MASK                0, 4, 0, 1, 1
@@ -5030,6 +5173,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1, 
     jg .w64_loop
     RET
 .w128:
+    _CET_ENDBR
     pmovzxbq            m11, [pb_02461357]
 .w128_loop:
     W_MASK                0, 4, 0, 1, 1
@@ -5064,6 +5208,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
     lea                  r6, [dsq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movd               xmm0, [dstq+dsq*0]
     pinsrd             xmm0, [dstq+dsq*1], 1
     vpbroadcastd       xmm1, [dstq+dsq*2]
@@ -5090,6 +5235,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
     jg .w4
     RET
 .w8:
+    _CET_ENDBR
     movq               xmm0, [dstq+dsq*0]
     vpbroadcastq       xmm1, [dstq+dsq*1]
     vpbroadcastq       ymm2, [dstq+dsq*2]
@@ -5120,6 +5266,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
     vzeroupper
     RET
 .w16:
+    _CET_ENDBR
     mova                xm1, [dstq+dsq*0]
     vinserti32x4        ym1, [dstq+dsq*1], 1
     vinserti32x4         m1, [dstq+dsq*2], 2
@@ -5146,6 +5293,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
     jg .w16
     RET
 .w32:
+    _CET_ENDBR
     mova                ym1, [dstq+dsq*0]
     vinserti32x8         m1, [dstq+dsq*1], 1
     mova                 m4, [maskq]
@@ -5179,6 +5327,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add               maskq, obmc_masks-blend_v_avx512icl_table
     jmp                  wq
 .w2:
+    _CET_ENDBR
     vpbroadcastd       xmm2, [maskq+2*2]
 .w2_s0_loop:
     movd               xmm0, [dstq+dsq*0]
@@ -5196,6 +5345,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w2_s0_loop
     RET
 .w4:
+    _CET_ENDBR
     vpbroadcastq       xmm2, [maskq+4*2]
 .w4_loop:
     movd               xmm0, [dstq+dsq*0]
@@ -5213,6 +5363,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     mova               xmm3, [maskq+8*2]
 .w8_loop:
     movq               xmm0, [dstq+dsq*0]
@@ -5233,6 +5384,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     vbroadcasti32x4     ym3, [maskq+16*2]
     vbroadcasti32x4     ym4, [maskq+16*3]
 .w16_loop:
@@ -5254,6 +5406,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     mova                 m4, [maskq+32*2]
     vshufi32x4           m3, m4, m4, q2020
     vshufi32x4           m4, m4, q3131
@@ -5291,6 +5444,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     neg                  hq
     jmp                  wq
 .w2:
+    _CET_ENDBR
     movd               xmm0, [dstq+dsq*0]
     pinsrw             xmm0, [dstq+dsq*1], 1
     movd               xmm2, [maskq+hq*2]
@@ -5308,6 +5462,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w2
     RET
 .w4:
+    _CET_ENDBR
     mova               xmm3, [blend_shuf]
 .w4_loop:
     movd               xmm0, [dstq+dsq*0]
@@ -5327,6 +5482,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     vbroadcasti128     ymm4, [blend_shuf]
     shufpd             ymm4, ymm4, 0x03
 .w8_loop:
@@ -5351,6 +5507,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     vzeroupper
     RET
 .w16:
+    _CET_ENDBR
     vbroadcasti32x4     ym4, [blend_shuf]
     shufpd              ym4, ym4, 0x0c
 .w16_loop:
@@ -5374,6 +5531,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     vbroadcasti32x4      m4, [blend_shuf]
     shufpd               m4, m4, 0xf0
 .w32_loop:
@@ -5397,6 +5555,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w32_loop
     RET
 .w64:
+    _CET_ENDBR
     vpbroadcastw         m3, [maskq+hq*2]
     mova                 m1, [dstq]
     mova                 m2, [tmpq]
@@ -5414,6 +5573,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w64
     RET
 .w128:
+    _CET_ENDBR
     vpbroadcastw         m6, [maskq+hq*2]
     mova                 m2, [dstq+64*0]
     mova                 m1, [tmpq+64*0]
