Index: src/x86/mc_sse.asm
--- src/x86/mc_sse.asm.orig
+++ src/x86/mc_sse.asm
@@ -342,11 +342,13 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     test               mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx                wd, word [t0+wq*2+table_offset(put,)]
     add                  wq, t0
     RESTORE_DSQ_32       t0
     jmp                  wq
 .put_w2:
+    _CET_ENDBR
     movzx               r4d, word [srcq+ssq*0]
     movzx               r6d, word [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -357,6 +359,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov                 r4d, [srcq+ssq*0]
     mov                 r6d, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -367,6 +370,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     movq                 m0, [srcq+ssq*0]
     movq                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -377,6 +381,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -387,6 +392,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w16
     RET
 .put_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0+16*0]
     movu                 m1, [srcq+ssq*0+16*1]
     movu                 m2, [srcq+ssq*1+16*0]
@@ -401,6 +407,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu                 m0, [srcq+16*0]
     movu                 m1, [srcq+16*1]
     movu                 m2, [srcq+16*2]
@@ -415,6 +422,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu                 m0, [srcq+16*0]
     movu                 m1, [srcq+16*1]
     movu                 m2, [srcq+16*2]
@@ -437,6 +445,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul               mxyd, 0x00ff00ff
@@ -454,6 +463,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     movifnidn           dsq, dsmp
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     pshufd               m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
 .h_w2_loop:
     movd                 m0, [srcq+ssq*0]
@@ -473,6 +483,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movq                 m4, [srcq+ssq*0]
     movhps               m4, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -488,6 +499,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w4
     RET
 .h_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -505,6 +517,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+8*0]
     movu                 m1, [srcq+8*1]
     add                srcq, ssq
@@ -521,6 +534,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     movu                 m0, [srcq+mmsize*0+8*0]
     movu                 m1, [srcq+mmsize*0+8*1]
     pshufb               m0, m4
@@ -547,6 +561,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     mov                  r6, -16*3
 .h_w64_loop:
     movu                 m0, [srcq+r6+16*3+8*0]
@@ -567,6 +582,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     mov                  r6, -16*7
 .h_w128_loop:
     movu                 m0, [srcq+r6+16*7+8*0]
@@ -587,6 +603,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_v)]
     imul               mxyd, 0x00ff00ff
     mova                 m5, [base+pw_2048]
@@ -597,6 +614,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     movifnidn           dsq, dsmp
     jmp                  wq
 .v_w2:
+    _CET_ENDBR
     movd                 m0, [srcq+ssq*0]
 .v_w2_loop:
     pinsrw               m0, [srcq+ssq*1], 1 ; 0 1
@@ -616,6 +634,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd                 m0, [srcq+ssq*0]
 .v_w4_loop:
     movd                 m2, [srcq+ssq*1]
@@ -637,6 +656,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                 m0, [srcq+ssq*0]
 .v_w8_loop:
     movq                 m2, [srcq+ssq*1]
@@ -685,15 +705,19 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg %%loop
 %endmacro
 .v_w16:
+    _CET_ENDBR
     PUT_BILIN_V_W16
     RET
 .v_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+(7<<16)]
     jmp .v_w16gt
 .v_w64:
+    _CET_ENDBR
     lea                 r6d, [hq+(3<<16)]
     jmp .v_w16gt
 .v_w32:
+    _CET_ENDBR
     lea                 r6d, [hq+(1<<16)]
 .v_w16gt:
     mov                  r4, srcq
@@ -720,6 +744,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .v_w16gt
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx                wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
@@ -733,6 +758,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     punpcklqdq           m6, m6
     jmp                  wq
 .hv_w2:
+    _CET_ENDBR
     RESTORE_DSQ_32       t0
     movd                 m0, [srcq+ssq*0]
     punpckldq            m0, m0
@@ -767,6 +793,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova                 m4, [base+bilin_h_shuf4]
     movddup              m0, [srcq+ssq*0]
     movifnidn           dsq, dsmp
@@ -794,6 +821,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movifnidn           dsq, dsmp
     pshufb               m0, m4
@@ -824,12 +852,15 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     jg .hv_w8_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea                 r6d, [hq+(7<<16)]
     jmp .hv_w16_start
 .hv_w64:
+    _CET_ENDBR
     lea                 r6d, [hq+(3<<16)]
     jmp .hv_w16_start
 .hv_w32:
+    _CET_ENDBR
     lea                 r6d, [hq+(1<<16)]
 .hv_w16_start:
     mov                  r4, srcq
@@ -839,6 +870,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, 
     mov                  r7, dstq
 %endif
 .hv_w16:
+    _CET_ENDBR
     movifnidn           dsq, dsmp
 %if WIN64
     movaps              r4m, m8
@@ -915,12 +947,14 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     test               mxyd, mxyd
     jnz .v
 .prep:
+    _CET_ENDBR
     movzx                wd, word [r6+wq*2+table_offset(prep,)]
     pxor                 m4, m4
     add                  wq, r6
     lea            stride3q, [strideq*3]
     jmp                  wq
 .prep_w4:
+    _CET_ENDBR
     movd                 m0, [srcq+strideq*0]
     movd                 m1, [srcq+strideq*1]
     movd                 m2, [srcq+strideq*2]
@@ -939,6 +973,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq                 m0, [srcq+strideq*0]
     movq                 m1, [srcq+strideq*1]
     movq                 m2, [srcq+strideq*2]
@@ -961,6 +996,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     movu                 m1, [srcq+strideq*0]
     movu                 m3, [srcq+strideq*1]
     lea                srcq, [srcq+strideq*2]
@@ -981,12 +1017,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w128:
+    _CET_ENDBR
     mov                  r3, -128
     jmp .prep_w32_start
 .prep_w64:
+    _CET_ENDBR
     mov                  r3, -64
     jmp .prep_w32_start
 .prep_w32:
+    _CET_ENDBR
     mov                  r3, -32
 .prep_w32_start:
     sub                srcq, r3
@@ -1015,6 +1054,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w32_vloop
     RET
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul               mxyd, 0x00ff00ff
@@ -1029,6 +1069,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     add                  wq, r6
     jmp                  wq
 .h_w4:
+    _CET_ENDBR
     mova                 m4, [base+bilin_h_shuf4]
     lea            stride3q, [strideq*3]
 .h_w4_loop:
@@ -1048,6 +1089,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     lea            stride3q, [strideq*3]
 .h_w8_loop:
     movu                 m0, [srcq+strideq*0]
@@ -1066,6 +1108,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+strideq*0+8*0]
     movu                 m1, [srcq+strideq*0+8*1]
     movu                 m2, [srcq+strideq*1+8*0]
@@ -1082,12 +1125,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16
     RET
 .h_w128:
+    _CET_ENDBR
     mov                  r3, -128
     jmp .h_w32_start
 .h_w64:
+    _CET_ENDBR
     mov                  r3, -64
     jmp .h_w32_start
 .h_w32:
+    _CET_ENDBR
     mov                  r3, -32
 .h_w32_start:
     sub                srcq, r3
@@ -1112,6 +1158,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_vloop
     RET
 .v:
+    _CET_ENDBR
     movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
     imul               mxyd, 0x00ff00ff
     add                mxyd, 0x00100010
@@ -1121,6 +1168,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     pshufd               m5, m5, q0000
     jmp                  wq
 .v_w4:
+    _CET_ENDBR
     movd                 m0, [srcq+strideq*0]
 .v_w4_loop:
     movd                 m1, [srcq+strideq*1]
@@ -1143,6 +1191,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq                 m0, [srcq+strideq*0]
 .v_w8_loop:
     movq                 m1, [srcq+strideq*1]
@@ -1167,6 +1216,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu                 m0, [srcq+strideq*0]
 .v_w16_loop:
     movu                 m1, [srcq+strideq*1]
@@ -1203,14 +1253,17 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea                 r3d, [hq+(3<<8)]
     mov                 r6d, 256
     jmp .v_w32_start
 .v_w64:
+    _CET_ENDBR
     lea                 r3d, [hq+(1<<8)]
     mov                 r6d, 128
     jmp .v_w32_start
 .v_w32:
+    _CET_ENDBR
     xor                 r3d, r3d
     mov                 r6d, 64
 .v_w32_start:
@@ -1276,6 +1329,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
 %endif
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
     movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
@@ -1286,6 +1340,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     pshufd               m6, m6, q0000
     jmp                  wq
 .hv_w4:
+    _CET_ENDBR
     mova                 m4, [base+bilin_h_shuf4]
     movddup              m0, [srcq+strideq*0]
     lea                  r3, [strideq*3]
@@ -1317,6 +1372,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+strideq*0]
     pshufb               m0, m4
     pmaddubsw            m0, m5 ; 0
@@ -1342,18 +1398,22 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea                 r3d, [hq+(7<<8)]
     mov                 r5d, 256
     jmp .hv_w16_start
 .hv_w64:
+    _CET_ENDBR
     lea                 r3d, [hq+(3<<8)]
     mov                 r5d, 128
     jmp .hv_w16_start
 .hv_w32:
+    _CET_ENDBR
     lea                 r3d, [hq+(1<<8)]
     mov                 r5d, 64
     jmp .hv_w16_start
 .hv_w16:
+    _CET_ENDBR
     xor                 r3d, r3d
     mov                 r5d, 32
 .hv_w16_start:
@@ -1488,6 +1548,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %endif
     jnz .v
 .put:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [base_reg+wq*2+table_offset(put,)]
     movifnidn           ssq, ssmp
@@ -1499,6 +1560,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     lea                  r6, [ssq*3]
     jmp                  wq
 .h:
+    _CET_ENDBR
 %if ARCH_X86_32
     test                ssd, 0xf00
 %else
@@ -1546,6 +1608,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     mov                  r4, dsm
 %endif
 .h_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -1565,6 +1628,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     add                srcq, wq
     add                dstq, wq
     neg                  wq
@@ -1585,6 +1649,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16_loop_v
     RET
 .v:
+    _CET_ENDBR
 %if ARCH_X86_32
     %define             dsq  r4
     %define              m8  [base+pw_512]
@@ -1621,6 +1686,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .v_w8
 %endif
 .v_w2:
+    _CET_ENDBR
 %if ARCH_X86_32
     mov                 dsq, dsm
     movd                 m1, [srcq+r6 *2]
@@ -1663,6 +1729,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     shl                  wd, 14
     lea                srcq, [srcq+r6*2]
@@ -1722,6 +1789,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     RET
 %if ARCH_X86_64
 .v_w8:
+    _CET_ENDBR
     WIN64_PUSH_XMM       12
     shl                  wd, 5
     lea                 r6d, [hq+wq-256]
@@ -1771,6 +1839,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     RET
 %endif ;ARCH_X86_64
 .hv:
+    _CET_ENDBR
     RESET_STACK_STATE
     cmp                  wd, 4
     jg .hv_w8
@@ -1820,6 +1889,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     cmp                  wd, 4
     je .hv_w4
 .hv_w2:
+    _CET_ENDBR
     mova                 m5, [base+subpel_h_shuf4]
     mova                 m6, [base+pw_34]
     pshufd               m7, m1, q0000
@@ -1877,6 +1947,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     movq                 m3, [srcq+ssq*0]
     movq                 m4, [srcq+ssq*1]
@@ -1961,6 +2032,7 @@ cglobal put_6tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     RESET_STACK_STATE
     shr                 mxd, 16
     sub                srcq, 2
@@ -2241,6 +2313,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     lea                  r6, [ssq*3]
     jmp                  wq
 .h_w2:
+    _CET_ENDBR
     mova                 m3, [base+subpel_h_shuf4]
     movifnidn           dsq, dsmp
 .h_w2_loop:
@@ -2262,6 +2335,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     and                 mxd, 0x7f
 %else
@@ -2294,6 +2368,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
 %if ARCH_X86_32
     test                ssd, 0xf00
 %else
@@ -2338,6 +2413,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     psraw               %1, 6
 %endmacro
 .h_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -2358,6 +2434,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     add                srcq, wq
     add                dstq, wq
     neg                  wq
@@ -2378,6 +2455,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16_loop_v
     RET
 .v:
+    _CET_ENDBR
 %if ARCH_X86_32
     movzx               mxd, ssb
     shr                 ssd, 16
@@ -2433,6 +2511,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .v_w8
 %endif
 .v_w2:
+    _CET_ENDBR
     movd                 m1, [srcq+ssq*0]
     movd                 m0, [srcq+ssq*1]
 %if ARCH_X86_32
@@ -2488,6 +2567,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     shl                  wd, 14
 %if STACK_ALIGNMENT < 16
@@ -2563,6 +2643,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     RET
 %if ARCH_X86_64
 .v_w8:
+    _CET_ENDBR
     shl                  wd, 5
     lea                 r6d, [hq+wq-256]
 .v_w8_loop0:
@@ -2629,6 +2710,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef subpel2
 %undef subpel3
 .hv:
+    _CET_ENDBR
     RESET_STACK_STATE
     cmp                  wd, 4
     jg .hv_w8
@@ -2694,6 +2776,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     cmp                  wd, 4
     je .hv_w4
 .hv_w2:
+    _CET_ENDBR
     mova                 m6, [base+subpel_h_shuf4]
     movq                 m2, [srcq+ssq*0]     ; 0
     movhps               m2, [srcq+ssq*1]     ; 0 _ 1
@@ -2773,6 +2856,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef w8192reg
 %undef d512reg
 .hv_w4:
+    _CET_ENDBR
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
 %define hv4_line_0_2 6
@@ -2950,6 +3034,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef subpelv2
 %undef subpelv3
 .hv_w8:
+    _CET_ENDBR
     RESET_STACK_STATE
 %define hv8_line_1 0
 %define hv8_line_2 1
@@ -3247,6 +3332,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     test                myd, 0xf00
     jnz .v
 .prep:
+    _CET_ENDBR
     tzcnt                wd, wd
     movzx                wd, word [base_reg-prep_ssse3+prep_ssse3_table+wq*2]
     pxor                 m4, m4
@@ -3259,6 +3345,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
 %endif
     jmp                  wq
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     test                myd, 0xf00
@@ -3303,6 +3390,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     pmulhrsw             %1, m5
 %endmacro
 .h_w8:
+    _CET_ENDBR
     movu                 m0, [srcq+ssq*0]
     movu                 m1, [srcq+ssq*1]
     lea                srcq, [srcq+ssq*2]
@@ -3315,6 +3403,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     add                srcq, wq
     neg                  wq
 .h_w16_loop_v:
@@ -3334,6 +3423,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     jg .h_w16_loop_v
     RET
 .v:
+    _CET_ENDBR
 %if ARCH_X86_32
     mov                 mxd, myd
     and                 mxd, 0x7f
@@ -3362,6 +3452,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     jg .v_w8
 %endif
 .v_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     lea                 r5d, [wq-4]
     shl                 r5d, 14
@@ -3423,6 +3514,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     RET
 %if ARCH_X86_64
 .v_w8:
+    _CET_ENDBR
     WIN64_PUSH_XMM       12
     lea                 r6d, [wq*4-32]
     lea                 r6d, [r6*8+hq]
@@ -3471,6 +3563,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     RET
 %endif ;ARCH_X86_64
 .hv:
+    _CET_ENDBR
     RESET_STACK_STATE
     cmp                  wd, 4
     jg .hv_w8
@@ -3597,6 +3690,7 @@ cglobal prep_6tap_8bpc, 1, 9, 0, tmp, src, ss, w, h, m
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     RESET_STACK_STATE
     shr                 mxd, 16
     sub                srcq, 2
@@ -3856,6 +3950,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     test                myd, 0xf00
     jz mangle(private_prefix %+ _prep_6tap_8bpc_ssse3).prep
 .v:
+    _CET_ENDBR
 %if ARCH_X86_32
     mov                 mxd, myd
     and                 mxd, 0x7f
@@ -3904,6 +3999,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     jns .v_w8
 %endif
 .v_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
  %if STACK_ALIGNMENT < mmsize
   %define srcm [esp+stack_size+gprsize*1]
@@ -3982,6 +4078,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     RET
 %if ARCH_X86_64
 .v_w8:
+    _CET_ENDBR
     lea                 r6d, [wq*8-64]
     mov                  r5, srcq
     mov                  r8, tmpq
@@ -4049,6 +4146,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
 %undef subpel2
 %undef subpel3
 .h_w4:
+    _CET_ENDBR
     WIN64_SPILL_XMM       7
 %if ARCH_X86_32
     and                 mxd, 0x7f
@@ -4081,6 +4179,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     jg .h_w4_loop
     RET
 .h:
+    _CET_ENDBR
     test                myd, 0xf00
     jnz .hv
     cmp                  wd, 4
@@ -4125,6 +4224,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     pmulhrsw            m%1, m7
 %endmacro
 .h_w8:
+    _CET_ENDBR
     PREP_8TAP_H           0, srcq+strideq*0
     PREP_8TAP_H           1, srcq+strideq*1
     mova        [tmpq+16*0], m0
@@ -4135,15 +4235,19 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mov                  r3, -16*1
     jmp .h_start
 .h_w32:
+    _CET_ENDBR
     mov                  r3, -16*2
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov                  r3, -16*4
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov                  r3, -16*8
 .h_start:
     sub                srcq, r3
@@ -4162,6 +4266,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
     jg .h_loop
     RET
 .hv:
+    _CET_ENDBR
     RESET_STACK_STATE
     cmp                  wd, 4
     jg .hv_w8
@@ -4385,6 +4490,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, 
 %undef subpelv2
 %undef subpelv3
 .hv_w8:
+    _CET_ENDBR
     RESET_STACK_STATE
 %define hv8_line_1 0
 %define hv8_line_2 1
@@ -5010,6 +5116,7 @@ cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, sr
     jmp                  wq
 %ifidn %1, put
 .w2:
+    _CET_ENDBR
  %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -5244,6 +5351,7 @@ cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, sr
 %endif
 INIT_XMM ssse3
 .w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -5589,22 +5697,27 @@ INIT_XMM ssse3
     jmp .w4_loop
 INIT_XMM ssse3
 .w8:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 1
     movifprep   tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 2
     movifprep   tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 4
     movifprep   tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 8
     movifprep   tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 16
     movifprep   tmp_stridem, 256
 .w_start:
@@ -6120,11 +6233,13 @@ INIT_XMM ssse3
     jmp .vloop
 INIT_XMM ssse3
 .dy1:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %ifidn %1, put
 .dy1_w2:
+    _CET_ENDBR
  %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -6299,6 +6414,7 @@ INIT_XMM ssse3
 %endif
 INIT_XMM ssse3
 .dy1_w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -6633,22 +6749,27 @@ INIT_XMM ssse3
     jmp .dy1_w4_loop
 INIT_XMM ssse3
 .dy1_w8:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 1
     movifprep   tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 2
     movifprep   tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 4
     movifprep   tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 8
     movifprep   tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 16
     movifprep   tmp_stridem, 256
 .dy1_w_start:
@@ -7099,11 +7220,13 @@ INIT_XMM ssse3
     jmp .dy1_vloop
 INIT_XMM ssse3
 .dy2:
+    _CET_ENDBR
     movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
     add                  wq, base_reg
     jmp                  wq
 %ifidn %1, put
 .dy2_w2:
+    _CET_ENDBR
  %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -7285,6 +7408,7 @@ INIT_XMM ssse3
 %endif
 INIT_XMM ssse3
 .dy2_w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov                 myd, mym
     movzx               t0d, t0b
@@ -7535,22 +7659,27 @@ INIT_XMM ssse3
     MC_8TAP_SCALED_RET
 INIT_XMM ssse3
 .dy2_w8:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 1
     movifprep   tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 2
     movifprep   tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 4
     movifprep   tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 8
     movifprep   tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov    dword [rsp+0x90], 16
     movifprep   tmp_stridem, 256
 .dy2_w_start:
@@ -8497,6 +8626,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
 %if ARCH_X86_32
  %define m8  m3
  %define m9  m4
@@ -8577,6 +8707,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     lea                dstq, [dstq+strideq*4]
 .w4: ; tile 4x
+    _CET_ENDBR
     movd   [dstq          ], m0      ; copy dw[0]
     pshuflw              m1, m0, q1032 ; swap dw[1] and dw[0]
     movd   [dstq+strideq*1], m1      ; copy dw[1]
@@ -8592,6 +8723,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     lea                dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movq   [dstq          ], m0
     movhps [dstq+strideq*1], m0
     sub                  hd, 2
@@ -8602,6 +8734,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     lea                dstq, [dstq+strideq]
 .w16:
+    _CET_ENDBR
     mova   [dstq          ], m0
     dec                  hd
     jg .w16_loop
@@ -8611,6 +8744,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     lea                dstq, [dstq+strideq]
 .w32:
+    _CET_ENDBR
     mova   [dstq          ], m0
     %1                    2
     mova   [dstq + 16     ], m0
@@ -8622,6 +8756,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     %assign i 0
     %rep 4
     mova   [dstq + i*16   ], m0
@@ -8638,6 +8773,7 @@ DECLARE_REG_TMP 6, 7
     %1                    0
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     %assign i 0
     %rep 8
     mova   [dstq + i*16   ], m0
@@ -8819,6 +8955,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 4
     lea                dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     pshufd               m3, m2, q2020
     pshufd               m2, m2, q3131
     psubw                m1, m7, m3
@@ -8842,6 +8979,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 4
     lea                dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movhlps              m3, m2
     psubw                m1, m7, m2
     psubw                m1, m3
@@ -8858,6 +8996,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 8
     lea                dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     mova   [dstq+strideq*1], m2
     mova   [dstq+strideq*0], m0
     call .main
@@ -8875,6 +9014,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 16
     lea                dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     mova            [maskq], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -8889,6 +9029,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 16*2
     lea                dstq, [dstq+strideq*2]
 .w64:
+    _CET_ENDBR
     mova       [maskq+16*0], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -8909,6 +9050,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add               maskq, 16*4
     lea                dstq, [dstq+strideq*2]
 .w128:
+    _CET_ENDBR
     mova       [maskq+16*0], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -9012,6 +9154,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 8
     lea                dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     packuswb             m2, m2
     psubb                m1, m7, m2
 %if ARCH_X86_64
@@ -9037,6 +9180,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 16
     lea                dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     W_MASK_422_BACKUP     0
     movq   [dstq+strideq*0], m0
     movhps [dstq+strideq*1], m0
@@ -9053,6 +9197,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 16
     lea                dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     W_MASK_422_BACKUP     0
     mova   [dstq+strideq*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -9066,6 +9211,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 16
     add                dstq, strideq
 .w32:
+    _CET_ENDBR
     W_MASK_422_BACKUP     0
     mova        [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -9079,6 +9225,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 16*2
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     W_MASK_422_BACKUP     0
     mova        [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -9098,6 +9245,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, 
     add               maskq, 16*4
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     W_MASK_422_BACKUP     0
     mova        [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -9148,6 +9296,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea                dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     movd   [dstq+strideq*0], m0
     pshuflw              m1, m0, q1032
     movd   [dstq+strideq*1], m1
@@ -9163,6 +9312,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea                dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movq   [dstq+strideq*0], m0
     movhps [dstq+strideq*1], m0
     sub                  hd, 2
@@ -9172,6 +9322,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea                dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     mova   [dstq+strideq*0], m0
     call .main
     mova   [dstq+strideq*1], m0
@@ -9182,6 +9333,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add                dstq, strideq
 .w32:
+    _CET_ENDBR
     mova        [dstq+16*0], m0
     call .main
     mova        [dstq+16*1], m0
@@ -9192,6 +9344,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add                dstq, strideq
 .w64:
+    _CET_ENDBR
     mova        [dstq+16*0], m0
     call .main
     mova        [dstq+16*1], m0
@@ -9206,6 +9359,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add                dstq, strideq
 .w128:
+    _CET_ENDBR
     mova        [dstq+16*0], m0
     call .main
     mova        [dstq+16*1], m0
@@ -9284,6 +9438,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea                  r6, [dsq*3]
     jmp                  wq
 .w4:
+    _CET_ENDBR
     movq                 m0, [maskq]; m
     movd                 m1, [dstq+dsq*0] ; a
     movd                 m6, [dstq+dsq*1]
@@ -9305,6 +9460,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w4
     RET
 .w8:
+    _CET_ENDBR
     mova                 m0, [maskq]; m
     movq                 m1, [dstq+dsq*0] ; a
     movhps               m1, [dstq+dsq*1]
@@ -9319,6 +9475,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w8
     RET
 .w16:
+    _CET_ENDBR
     mova                 m0, [maskq]; m
     mova                 m1, [dstq] ; a
     mova                 m6, [tmpq] ; b
@@ -9331,6 +9488,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w16
     RET
 .w32:
+    _CET_ENDBR
     %assign i 0
     %rep 2
     mova                 m0, [maskq+16*i]; m
@@ -9358,6 +9516,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add               maskq, obmc_masks-blend_v_ssse3_table
     jmp                  wq
 .w2:
+    _CET_ENDBR
     movd                 m3, [maskq+4]
     punpckldq            m3, m3
     ; 2 mask blend is provided for 4 pixels / 2 lines
@@ -9379,6 +9538,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w2_loop
     RET
 .w4:
+    _CET_ENDBR
     movddup              m3, [maskq+8]
     ; 4 mask blend is provided for 8 pixels / 2 lines
 .w4_loop:
@@ -9399,6 +9559,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     mova                 m3, [maskq+16]
     ; 8 mask blend is provided for 16 pixels
 .w8_loop:
@@ -9414,6 +9575,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     ; 16 mask blend is provided for 32 pixels
     mova                  m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
     mova                  m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
@@ -9428,6 +9590,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
 %if WIN64
     mova            [rsp+8], xmm6
 %endif
@@ -9477,6 +9640,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     neg                  hq
     jmp                  wq
 .w2:
+    _CET_ENDBR
     movd                 m0, [dstq+dsq*0]
     pinsrw               m0, [dstq+dsq*1], 1
     movd                 m2, [maskq+hq*2]
@@ -9496,6 +9660,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w2
     RET
 .w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     mova                 m3, [base+blend_shuf]
 %else
@@ -9521,6 +9686,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     movd                 m4, [maskq+hq*2]
     punpcklwd            m4, m4
     pshufd               m3, m4, q0000
@@ -9538,6 +9704,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     RET
 ; w16/w32/w64/w128
 .w16:
+    _CET_ENDBR
 %if ARCH_X86_32
     mov                 r6d, wm
 %endif
