Index: src/x86/mc_avx2.asm
--- src/x86/mc_avx2.asm.orig
+++ src/x86/mc_avx2.asm
@@ -214,10 +214,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     test               mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put,)]
     add                  wq, r7
     jmp                  wq
 .put_w2:
+    _CET_ENDBR
     movzx               r6d, word [srcq+ssq*0]
     movzx               r7d, word [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -228,6 +230,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov                 r6d, [srcq+ssq*0]
     mov                 r7d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -238,6 +241,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     mov                  r6, [srcq+ssq*0]
     mov                  r7, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -248,6 +252,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -259,6 +264,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w, 
     RET
 INIT_YMM avx2
 .put_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -269,6 +275,7 @@ INIT_YMM avx2
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+32*0]
     movu                 m1, [srcq+ssq*0+32*1]
     movu                 m2, [srcq+ssq*1+32*0]
@@ -283,6 +290,7 @@ INIT_YMM avx2
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+32*0]
     movu                 m1, [srcq+32*1]
     movu                 m2, [srcq+32*2]
@@ -297,6 +305,7 @@ INIT_YMM avx2
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul               mxyd, 255
@@ -312,6 +321,7 @@ INIT_YMM avx2
     add                  wq, r7
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movd                xm0, [srcq+ssq*0]
     pinsrd              xm0, [srcq+ssq*1], 1
     lea                srcq, [srcq+ssq*2]
@@ -326,6 +336,7 @@ INIT_YMM avx2
     jg .h_w2
     RET
 .h_w4:
+    _CET_ENDBR
     mova                xm4, [bilin_h_shuf4]
 .h_w4_loop:
     movq                xm0, [srcq+ssq*0]
@@ -342,6 +353,7 @@ INIT_YMM avx2
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
     movu                xm1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -359,6 +371,7 @@ INIT_YMM avx2
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0+8*0]
     vinserti128          m0, [srcq+ssq*1+8*0], 1
     movu                xm1, [srcq+ssq*0+8*1]
@@ -378,6 +391,7 @@ INIT_YMM avx2
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     add                srcq, ssq
@@ -394,6 +408,7 @@ INIT_YMM avx2
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     pshufb               m0, m4
@@ -420,6 +435,7 @@ INIT_YMM avx2
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -32*3
 .h_w128_loop:
     movu                 m0, [srcq+r6+32*3+8*0]
@@ -440,6 +456,7 @@ INIT_YMM avx2
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_v)]
     imul               mxyd, 255
     vpbroadcastd         m5, [pw_2048]
@@ -449,6 +466,7 @@ INIT_YMM avx2
     vpbroadcastw         m4, xm4
     jmp                  wq
 .v_w2:
+    _CET_ENDBR
     movd                xm0,      [srcq+ssq*0]
 .v_w2_loop:
     pinsrw              xm1, xm0, [srcq+ssq*1], 1 ; 0 1
@@ -466,6 +484,7 @@ INIT_YMM avx2
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd                xm0, [srcq+ssq*0]
 .v_w4_loop:
     vpbroadcastd        xm2, [srcq+ssq*1]
@@ -484,6 +503,7 @@ INIT_YMM avx2
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm0, [srcq+ssq*0]
 .v_w8_loop:
     movq                xm2, [srcq+ssq*1]
@@ -503,6 +523,7 @@ INIT_YMM avx2
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
 .v_w16_loop:
     vbroadcasti128       m3, [srcq+ssq*1]
@@ -524,6 +545,7 @@ INIT_YMM avx2
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
 %macro PUT_BILIN_V_W32 0
     movu                 m0, [srcq+ssq*0]
 %%loop:
@@ -553,6 +575,7 @@ INIT_YMM avx2
     PUT_BILIN_V_W32
     RET
 .v_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+32*0]
     movu                 m1, [srcq+32*1]
 .v_w64_loop:
@@ -582,6 +605,7 @@ INIT_YMM avx2
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+(3<<8)]
     mov                  r4, srcq
     mov                  r7, dstq
@@ -596,6 +620,7 @@ INIT_YMM avx2
     jg .v_w128_loop
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
@@ -608,6 +633,7 @@ INIT_YMM avx2
     vpbroadcastw         m6, xm6
     jmp                  wq
 .hv_w2:
+    _CET_ENDBR
     vpbroadcastd        xm0, [srcq+ssq*0]
     pshufb              xm0, xm4
     pmaddubsw           xm0, xm5
@@ -632,6 +658,7 @@ INIT_YMM avx2
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova                xm4, [bilin_h_shuf4]
     movddup             xm0, [srcq+ssq*0]
     pshufb              xm0, xm4
@@ -657,6 +684,7 @@ INIT_YMM avx2
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128       m0, [srcq+ssq*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5
@@ -682,6 +710,7 @@ INIT_YMM avx2
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+8*0]
     vinserti128          m0, [srcq+ssq*0+8*1], 1
     pshufb               m0, m4
@@ -715,14 +744,17 @@ INIT_YMM avx2
     jg .hv_w16_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+(3<<16)]
     jmp .hv_w32_start
 .hv_w64:
+    _CET_ENDBR
     lea                 r6d, [hq+(1<<16)]
 .hv_w32_start:
     mov                  r4, srcq
     mov                  r7, dstq
 .hv_w32:
+    _CET_ENDBR
 %if WIN64
     movaps              r4m, xmm8
 %endif
@@ -781,11 +813,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     test               mxyd, mxyd
     jnz .v
 .prep:
+    _CET_ENDBR
     movzx                wd, word [r6+wq*2+table_offset(prep,)]
     add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
+    _CET_ENDBR
     movd                xm0, [srcq+strideq*0]
     pinsrd              xm0, [srcq+strideq*1], 1
     pinsrd              xm0, [srcq+strideq*2], 2
@@ -799,6 +833,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq                xm0, [srcq+strideq*0]
     movhps              xm0, [srcq+strideq*1]
     movq                xm1, [srcq+strideq*2]
@@ -815,6 +850,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+strideq*0]
     pmovzxbw             m1, [srcq+strideq*1]
     pmovzxbw             m2, [srcq+strideq*2]
@@ -833,6 +869,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w32:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+strideq*0+16*0]
     pmovzxbw             m1, [srcq+strideq*0+16*1]
     pmovzxbw             m2, [srcq+strideq*1+16*0]
@@ -851,6 +888,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w32
     RET
 .prep_w64:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+16*0]
     pmovzxbw             m1, [srcq+16*1]
     pmovzxbw             m2, [srcq+16*2]
@@ -869,6 +907,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w64
     RET
 .prep_w128:
+    _CET_ENDBR
     pmovzxbw             m0, [srcq+16*0]
     pmovzxbw             m1, [srcq+16*1]
     pmovzxbw             m2, [srcq+16*2]
@@ -899,6 +938,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w128
     RET
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul               mxyd, 255
@@ -914,6 +954,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     vbroadcasti128       m4, [bilin_h_shuf4]
 .h_w4_loop:
     movq                xm0, [srcq+strideq*0]
@@ -930,6 +971,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 .h_w8_loop:
     movu                xm0, [srcq+strideq*0]
     vinserti128          m0, [srcq+strideq*1], 1
@@ -947,6 +989,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
 .h_w16_loop:
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
@@ -974,6 +1017,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
 .h_w32_loop:
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
@@ -1001,6 +1045,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_loop
     RET
 .h_w64:
+    _CET_ENDBR
     movu                xm0, [srcq+8*0]
     vinserti128          m0, [srcq+8*1], 1
     movu                xm1, [srcq+8*2]
@@ -1027,6 +1072,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     movu                xm0, [srcq+8*0]
     vinserti128          m0, [srcq+8*1], 1
     movu                xm1, [srcq+8*2]
@@ -1073,6 +1119,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM       7
     movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
     imul               mxyd, 255
@@ -1083,6 +1130,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     vpbroadcastw         m6, xm6
     jmp                  wq
 .v_w4:
+    _CET_ENDBR
     movd                xm0, [srcq+strideq*0]
 .v_w4_loop:
     vpbroadcastd         m1, [srcq+strideq*2]
@@ -1103,6 +1151,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm0, [srcq+strideq*0]
 .v_w8_loop:
     vpbroadcastq         m1, [srcq+strideq*2]
@@ -1126,6 +1175,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     vbroadcasti128       m0, [srcq+strideq*0]
 .v_w16_loop:
     vbroadcasti128       m1, [srcq+strideq*1]
@@ -1153,6 +1203,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     vpermq               m0, [srcq+strideq*0], q3120
 .v_w32_loop:
     vpermq               m1, [srcq+strideq*1], q3120
@@ -1189,6 +1240,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w32_loop
     RET
 .v_w64:
+    _CET_ENDBR
     vpermq               m0, [srcq+strideq*0+32*0], q3120
     vpermq               m1, [srcq+strideq*0+32*1], q3120
 .v_w64_loop:
@@ -1226,6 +1278,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+(3<<8)]
     mov                  r3, srcq
     mov                  r5, tmpq
@@ -1259,6 +1312,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w128_loop0
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
     WIN64_SPILL_XMM       7
@@ -1270,6 +1324,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea            stride3q, [strideq*3]
     jmp                  wq
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti128       m4, [bilin_h_shuf4]
     vpbroadcastq         m0, [srcq+strideq*0]
     pshufb               m0, m4
@@ -1295,6 +1350,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128       m0, [srcq+strideq*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5
@@ -1323,6 +1379,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
     pshufb               m0, m4
@@ -1350,6 +1407,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w16_loop
     RET
 .hv_w32:
+    _CET_ENDBR
     movu                xm0, [srcq+8*0]
     vinserti128          m0, [srcq+8*1], 1
     movu                xm1, [srcq+8*2]
@@ -1383,10 +1441,12 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w32_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea                 r3d, [hq+(7<<8)]
     mov                 r6d, 256
     jmp .hv_w64_start
 .hv_w64:
+    _CET_ENDBR
     lea                 r3d, [hq+(3<<8)]
     mov                 r6d, 128
 .hv_w64_start:
@@ -1476,6 +1536,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     test                myd, 0xf00
     jnz .v
 .put:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [r8+wq*2+table_offset(put,)]
     add                  wq, r8
@@ -1486,6 +1547,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 %endif
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     movzx               mxd, mxb
     lea                srcq, [srcq-1]
     vpbroadcastd        xm4, [r8+mxq*8+subpel_filters-put_avx2+2]
@@ -1508,6 +1570,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     mova                xm3, [subpel_h_shufA]
 .h_w4_loop:
     movq                xm0, [srcq+ssq*0]
@@ -1528,6 +1591,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
@@ -1548,6 +1612,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     vpbroadcastw        m10, [mxq+4]
     jmp                  wq
 .h_w8:
+    _CET_ENDBR
 %macro PUT_6TAP_H 3 ; dst/src, tmp[1-2]
     pshufb              m%2, m%1, m4
     pmaddubsw           m%2, m8
@@ -1573,6 +1638,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0+8*0]
     vinserti128          m0, [srcq+ssq*1+8*0], 1
     movu                xm1, [srcq+ssq*0+8*1]
@@ -1588,12 +1654,15 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor                 r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov                  r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -32*3
 .h_start:
     sub                srcq, r6
@@ -1615,6 +1684,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM       9, 12
     movzx               mxd, myb
     shr                 myd, 16
@@ -1632,6 +1702,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     neg                 nsq
     jmp                  r6
 .v_w2:
+    _CET_ENDBR
     movd                xm2, [srcq+nsq*2]
     pinsrw              xm2, [srcq+nsq*1], 2
     pinsrw              xm2, [srcq+ssq*0], 4
@@ -1663,6 +1734,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd                xm2, [srcq+nsq*2]
     pinsrd              xm2, [srcq+nsq*1], 1
     pinsrd              xm2, [srcq+ssq*0], 2
@@ -1694,6 +1766,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm1, [srcq+nsq*2]
     vpbroadcastq         m3, [srcq+nsq*1]
     vpbroadcastq         m2, [srcq+ssq*0]
@@ -1732,6 +1805,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [wq*8-128]
     WIN64_PUSH_XMM       12
     lea                 r6d, [hq+r6*2]
@@ -1786,6 +1860,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      12, 16
     cmp                  wd, 4
     jg .hv_w8
@@ -1854,6 +1929,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova                 m5, [subpel_h_shuf4]
     vpbroadcastq         m2, [srcq+nsq*2]
     vpbroadcastq         m4, [srcq+nsq*1]
@@ -1907,6 +1983,7 @@ cglobal put_6tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     sub                srcq, 2
     lea                 mxq, [r8+mxq*8+subpel_filters+1-put_avx2]
@@ -2029,6 +2106,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     test                myd, 0xf00
     jz mangle(private_prefix %+ _put_6tap_8bpc_avx2).put
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM      12, 15
     movzx               mxd, myb
     shr                 myd, 16
@@ -2047,6 +2125,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     sub                srcq, ss3q
     jmp                  r6
 .v_w2:
+    _CET_ENDBR
     movd                xm2, [srcq+ssq*0]
     pinsrw              xm2, [srcq+ssq*1], 2
     pinsrw              xm2, [srcq+ssq*2], 4
@@ -2087,6 +2166,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd                xm2, [srcq+ssq*0]
     pinsrd              xm2, [srcq+ssq*1], 1
     pinsrd              xm2, [srcq+ssq*2], 2
@@ -2127,6 +2207,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm1, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
     vpbroadcastq         m2, [srcq+ssq*2]
@@ -2174,6 +2255,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [wq*8-128]
     WIN64_PUSH_XMM       15
     lea                 r6d, [hq+r6*2]
@@ -2242,6 +2324,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 .h:
 .h_w2:
 .h_w4:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m5, [pw_34] ; 2 + (8 << 2)
@@ -2260,6 +2343,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     add                  wq, r8
     jmp                  wq
 .h_w8:
+    _CET_ENDBR
 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
     pshufb              m%2, m%1, m7
     pshufb              m%3, m%1, m8
@@ -2287,6 +2371,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0+8*0]
     vinserti128          m0, [srcq+ssq*1+8*0], 1
     movu                xm1, [srcq+ssq*0+8*1]
@@ -2302,12 +2387,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor                 r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov                  r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -32*3
 .h_start:
     sub                srcq, r6
@@ -2329,6 +2417,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_loop
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      14, 16
     cmp                  wd, 4
     jg .hv_w8
@@ -2408,6 +2497,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova                 m6, [subpel_h_shuf4]
     vpbroadcastq         m2, [srcq+ssq*0]
     vpbroadcastq         m4, [srcq+ssq*1]
@@ -2472,6 +2562,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     WIN64_PUSH_XMM       16
     shr                 mxd, 16
     sub                srcq, 3
@@ -2618,6 +2709,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     test                myd, 0xf00
     jnz .v
 .prep:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [r7+wq*2+table_offset(prep,)]
     add                  wq, r7
@@ -2627,6 +2719,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
 %endif
     jmp                  wq
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM      10, 12
     movzx               mxd, myb
     shr                 myd, 16
@@ -2643,6 +2736,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .v_w16
     je .v_w8
 .v_w4:
+    _CET_ENDBR
     movd                xm2, [srcq+nsq*2]
     pinsrd              xm2, [srcq+nsq*1], 1
     vpbroadcastd         m1, [srcq+ssq*0]
@@ -2677,6 +2771,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm1, [srcq+nsq*2]
     vpbroadcastq         m3, [srcq+nsq*1]
     vpbroadcastq         m2, [srcq+ssq*0]
@@ -2719,6 +2814,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     lea                 r6d, [wq*2-32]
     lea                srcq, [srcq+nsq*2]
     WIN64_PUSH_XMM       12
@@ -2773,6 +2869,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .v_w16_loop0
     RET
 .h_w4:
+    _CET_ENDBR
     RESET_STACK_STATE
     movzx               mxd, mxb
     vbroadcasti128       m3, [subpel_h_shufA]
@@ -2799,6 +2896,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m4, [pw_8192]
@@ -2819,6 +2917,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     vpbroadcastw         m9, [mxq+4]
     jmp                  wq
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0]
     vinserti128          m0, [srcq+ssq*1], 1
     lea                srcq, [srcq+ssq*2]
@@ -2840,6 +2939,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+ssq*0+8*0]
     vinserti128          m0, [srcq+ssq*0+8*1], 1
     PREP_6TAP_H
@@ -2854,12 +2954,15 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor                 r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov                  r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -32*3
 .h_start:
     sub                srcq, r6
@@ -2882,10 +2985,12 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .h_loop
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      14, 16
     cmp                  wd, 4
     jne .hv_w8
 .hv_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     dec                srcq
     vpbroadcastd         m7, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -2965,6 +3070,7 @@ cglobal prep_6tap_8bpc, 3, 8, 0, tmp, src, ss, w, h, m
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr                 mxd, 16
     lea                 mxq, [r7+mxq*8+subpel_filters+1-prep_avx2]
     WIN64_PUSH_XMM       16
@@ -3075,6 +3181,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     test                myd, 0xf00
     jz mangle(private_prefix %+ _prep_6tap_8bpc_avx2).prep
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM      12, 15
     movzx               mxd, myb ; Select 4-tap/8-tap filter multipliers.
     shr                 myd, 16  ; Note that the code is 8-tap only, having
@@ -3092,6 +3199,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .v_w16
     je .v_w8
 .v_w4:
+    _CET_ENDBR
     movd                xm0, [srcq+strideq*0]
     vpbroadcastd         m1, [srcq+strideq*2]
     vpbroadcastd        xm2, [srcq+strideq*1]
@@ -3136,6 +3244,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                xm1, [srcq+strideq*0]
     vpbroadcastq         m4, [srcq+strideq*1]
     vpbroadcastq         m2, [srcq+strideq*2]
@@ -3189,6 +3298,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     lea                 r6d, [wq*2-32]
     WIN64_PUSH_XMM       15
     lea                 r6d, [hq+r6*8]
@@ -3255,6 +3365,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     RET
 .h:
 .h_w4:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     vpbroadcastd         m4, [pw_8192]
@@ -3273,6 +3384,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     add                  wq, r7
     jmp                  wq
 .h_w8:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0]
     vinserti128          m0, [srcq+strideq*1], 1
     lea                srcq, [srcq+strideq*2]
@@ -3296,6 +3408,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                xm0, [srcq+strideq*0+8*0]
     vinserti128          m0, [srcq+strideq*0+8*1], 1
     PREP_8TAP_H
@@ -3310,12 +3423,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor                 r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov                  r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -32*3
 .h_start:
     sub                srcq, r6
@@ -3338,6 +3454,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .h_loop
     RET
 .hv:
+    _CET_ENDBR
     WIN64_SPILL_XMM      16
     cmp                  wd, 4
     je .hv_w4
@@ -3360,6 +3477,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     pshufd              m15, m0, q3333
     jmp .hv_w8
 .hv_w4:
+    _CET_ENDBR
     movzx               mxd, mxb
     dec                srcq
     vpbroadcastd         m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -3453,6 +3571,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     lea                 r6d, [wq*8-64]
     lea                 r6d, [hq+r6*4]
 .hv_w8_loop0:
@@ -3697,6 +3816,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp                  wq
 %ifidn %1, put
 .w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     dec                srcq
@@ -3807,6 +3927,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp .w2_loop
 %endif
 .w4:
+    _CET_ENDBR
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
     movzx               t0d, t0b
@@ -3946,22 +4067,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     lea                srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
+    _CET_ENDBR
     mov      dword [rsp+48], 1
     movifprep   tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov      dword [rsp+48], 2
     movifprep   tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov      dword [rsp+48], 4
     movifprep   tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov      dword [rsp+48], 8
     movifprep   tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov      dword [rsp+48], 16
     movifprep   tmp_stridem, 256
 .w_start:
@@ -4162,11 +4288,13 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     pmulhrsw             m3, m12
     jmp .vloop
 .dy1:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_avx2_dy1_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %ifidn %1, put
 .dy1_w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     dec                srcq
@@ -4254,6 +4382,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy1_w4:
+    _CET_ENDBR
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
     movzx               t0d, t0b
@@ -4365,22 +4494,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy1_w4_loop
     MC_8TAP_SCALED_RET
 .dy1_w8:
+    _CET_ENDBR
     mov      dword [rsp+72], 1
     movifprep   tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov      dword [rsp+72], 2
     movifprep   tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov      dword [rsp+72], 4
     movifprep   tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov      dword [rsp+72], 8
     movifprep   tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov      dword [rsp+72], 16
     movifprep   tmp_stridem, 256
 .dy1_w_start:
@@ -4526,11 +4660,13 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     pblendw              m3, m4, 0xaa
     jmp .dy1_vloop
 .dy2:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %ifidn %1, put
 .dy2_w2:
+    _CET_ENDBR
     mov                 myd, mym
     movzx               t0d, t0b
     dec                srcq
@@ -4621,6 +4757,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy2_w4:
+    _CET_ENDBR
     mov                 myd, mym
     vbroadcasti128       m7, [base+rescale_mul]
     movzx               t0d, t0b
@@ -4725,22 +4862,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy2_w4_loop
     MC_8TAP_SCALED_RET
 .dy2_w8:
+    _CET_ENDBR
     mov      dword [rsp+40], 1
     movifprep   tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov      dword [rsp+40], 2
     movifprep   tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov      dword [rsp+40], 4
     movifprep   tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov      dword [rsp+40], 8
     movifprep   tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov      dword [rsp+40], 16
     movifprep   tmp_stridem, 256
 .dy2_w_start:
@@ -5096,6 +5238,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
     lea               tmp1d, [mxq+alphaq*4]
     lea               tmp2d, [mxq+alphaq*1]
     vbroadcasti128      m10, [srcq]
@@ -5138,6 +5281,7 @@ ALIGN function_align
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movd   [dstq          ], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5171,6 +5315,7 @@ ALIGN function_align
     %1                    0
     lea                dstq, [dstq+strideq*4]
 .w8:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movq   [dstq          ], xm0
     movq   [dstq+strideq*1], xm1
@@ -5184,6 +5329,7 @@ ALIGN function_align
     %1                    0
     lea                dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova         [dstq          ], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5199,6 +5345,7 @@ ALIGN function_align
     %1                    0
     lea                dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova   [dstq+strideq*0], m0
     %1                    2
@@ -5212,6 +5359,7 @@ ALIGN function_align
     %1                    0
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova             [dstq], m0
     %1                    2
@@ -5224,6 +5372,7 @@ ALIGN function_align
     %1                    0
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+0*32], m0
     %1                    2
@@ -5393,6 +5542,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea                  r6, [dsq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movd                xm0, [dstq+dsq*0]
     pinsrd              xm0, [dstq+dsq*1], 1
     vpbroadcastd        xm1, [dstq+dsq*2]
@@ -5420,6 +5570,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movq                xm1, [dstq+dsq*0]
     movhps              xm1, [dstq+dsq*1]
     vpbroadcastq         m2, [dstq+dsq*2]
@@ -5450,6 +5601,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     mova                 m0, [maskq]
     mova                xm1, [dstq+dsq*0]
     vinserti128          m1, [dstq+dsq*1], 1
@@ -5473,6 +5625,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova                 m0, [maskq]
     mova                 m1, [dstq]
     mova                 m6, [maskq+tmpq]
@@ -5504,6 +5657,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add               maskq, obmc_masks-blend_v_avx2_table
     jmp                  wq
 .w2:
+    _CET_ENDBR
     vpbroadcastd        xm2, [maskq+2*2]
 .w2_s0_loop:
     movd                xm0, [dstq+dsq*0]
@@ -5522,6 +5676,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     vpbroadcastq        xm2, [maskq+4*2]
 .w4_loop:
     movd                xm0, [dstq+dsq*0]
@@ -5540,6 +5695,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     mova                xm3, [maskq+8*2]
 .w8_loop:
     movq                xm0, [dstq+dsq*0]
@@ -5561,6 +5717,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128       m3, [maskq+16*2]
     vbroadcasti128       m4, [maskq+16*3]
 .w16_loop:
@@ -5583,6 +5740,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova                xm3, [maskq+16*4]
     vinserti128          m3, [maskq+16*6], 1
     mova                xm4, [maskq+16*5]
@@ -5620,6 +5778,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     neg                  hq
     jmp                  wq
 .w2:
+    _CET_ENDBR
     movd                xm0, [dstq+dsq*0]
     pinsrw              xm0, [dstq+dsq*1], 1
     movd                xm2, [maskq+hq*2]
@@ -5638,6 +5797,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     mova                xm3, [blend_shuf]
 .w4_loop:
     movd                xm0, [dstq+dsq*0]
@@ -5658,6 +5818,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     vbroadcasti128       m4, [blend_shuf]
     shufpd               m4, m4, 0x03
 .w8_loop:
@@ -5682,6 +5843,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128       m4, [blend_shuf]
     shufpd               m4, m4, 0x0c
 .w16_loop:
@@ -5706,6 +5868,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32: ; w32/w64/w128
+    _CET_ENDBR
     sub                 dsq, r6
 .w32_loop0:
     vpbroadcastw         m3, [maskq+hq*2]
@@ -6068,6 +6231,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movd   [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -6120,6 +6284,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*4]
     add               maskq, 8
 .w8:
+    _CET_ENDBR
     vextracti128        xm2, m4, 1
     vextracti128        xm1, m0, 1
     psubw               xm4, xm8, xm4
@@ -6141,6 +6306,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*4]
     add               maskq, 16
 .w16:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -6166,6 +6332,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*2]
     add               maskq, 16
 .w32:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova   [dstq+strideq*0], m0
     W_MASK                0, 5, 2, 3
@@ -6190,6 +6357,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     W_MASK                0, 4, 0, 1
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     W_MASK                0, 5, 2, 3
@@ -6216,6 +6384,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1, 
     W_MASK                0, 4, 0, 1
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     W_MASK                0, 5, 2, 3
@@ -6275,6 +6444,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movd   [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -6322,6 +6492,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*4]
     add               maskq, 16
 .w8:
+    _CET_ENDBR
     vextracti128        xm5, m4, 1
     vextracti128        xm1, m0, 1
     packuswb            xm4, xm5
@@ -6343,6 +6514,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*4]
     add               maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -6365,6 +6537,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     lea                dstq, [dstq+strideq*2]
     add               maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova   [dstq+strideq*0], m0
     W_MASK                0, 5, 2, 3
@@ -6385,6 +6558,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     add                dstq, strideq
     add               maskq, 32
 .w64:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     W_MASK                0, 5, 2, 3
@@ -6405,6 +6579,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1, 
     add                dstq, strideq
     add               maskq, 32*2
 .w128:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     W_MASK                0, 5, 2, 3
@@ -6445,6 +6620,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea            stride3q, [strideq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movd   [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -6481,6 +6657,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea                dstq, [dstq+strideq*4]
     add               maskq, 32
 .w8:
+    _CET_ENDBR
     vextracti128        xm1, m0, 1
     movq   [dstq+strideq*0], xm0
     movq   [dstq+strideq*1], xm1
@@ -6497,6 +6674,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea                dstq, [dstq+strideq*2]
     add               maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova         [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -6511,6 +6689,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add                dstq, strideq
     add               maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova             [dstq], m0
     mova            [maskq], m4
@@ -6524,6 +6703,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add                dstq, strideq
     add               maskq, 32*2
 .w64:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     mova       [maskq+32*0], m4
@@ -6541,6 +6721,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add                dstq, strideq
     add               maskq, 32*4
 .w128:
+    _CET_ENDBR
     vpermq               m0, m0, q3120
     mova        [dstq+32*0], m0
     mova       [maskq+32*0], m4
