Index: src/x86/refmvs.asm
--- src/x86/refmvs.asm.orig
+++ src/x86/refmvs.asm
@@ -254,18 +254,21 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, 
     jg .loop_y
     RET
 .write1:
+    _CET_ENDBR
     movd    [rpq+xq+0], m0
     psrlq           m0, 8
     movd    [rpq+xq+1], m0
     add             xq, 5*1
     ret
 .write2:
+    _CET_ENDBR
     movq    [rpq+xq+0], m0
     psrlq           m0, 8
     movd    [rpq+xq+6], m0
     add             xq, 5*2
     ret
 .write4:
+    _CET_ENDBR
     pshufb          m0, m9
     movu   [rpq+xq+ 0], m0
     psrlq           m0, 8
@@ -273,6 +276,7 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, 
     add             xq, 5*4
     ret
 .write8:
+    _CET_ENDBR
     pshufb          m2, m0, m9
     movu   [rpq+xq+ 0], m2
     pshufb          m0, m10
@@ -282,6 +286,7 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, 
     add             xq, 5*8
     ret
 .write16:
+    _CET_ENDBR
     pshufb          m2, m0, m9
     movu   [rpq+xq+ 0], m2
     pshufb          m0, m10
@@ -315,6 +320,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     lea             aq, [aq+bx4q*4]
     jmp           bw4q
 .w32:
+    _CET_ENDBR
     mova    [aq-16*16], m0
     mova    [aq-16*15], m1
     mova    [aq-16*14], m2
@@ -328,6 +334,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     mova    [aq-16* 6], m1
     mova    [aq-16* 5], m2
 .w16:
+    _CET_ENDBR
     mova    [aq-16* 4], m0
     mova    [aq-16* 3], m1
     mova    [aq-16* 2], m2
@@ -335,10 +342,12 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     mova    [aq+16* 0], m1
     mova    [aq+16* 1], m2
 .w8:
+    _CET_ENDBR
     mova    [aq+16* 2], m0
     mova    [aq+16* 3], m1
     mova    [aq+16* 4], m2
 .w4:
+    _CET_ENDBR
     mova    [aq+16* 5], m0
     mova    [aq+16* 6], m1
     mova    [aq+16* 7], m2
@@ -346,12 +355,14 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     jg .loop
     RET
 .w2:
+    _CET_ENDBR
     movu      [aq+104], m0
     movq      [aq+120], m1
     dec           bh4d
     jg .loop
     RET
 .w1:
+    _CET_ENDBR
     movq      [aq+116], m0
     movd      [aq+124], m2
     dec           bh4d
@@ -650,17 +661,20 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
     jg .loop_y
     RET
 .write1:
+    _CET_ENDBR
     movd   [rpq+xq+ 0], xm0
     pextrb [rpq+xq+ 4], xm0, 4
     add             xq, 5*1
     ret
 .write2:
+    _CET_ENDBR
     movq    [rpq+xq+0], xm0
     psrlq          xm1, xm0, 8
     movd    [rpq+xq+6], xm1
     add             xq, 5*2
     ret
 .write4:
+    _CET_ENDBR
     pshufb         xm1, xm0, xm8
     movu   [rpq+xq+ 0], xm1
     psrlq          xm1, 8
@@ -668,6 +682,7 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
     add             xq, 5*4
     ret
 .write8:
+    _CET_ENDBR
     vinserti128     m1, m0, xm0, 1
     pshufb          m1, m8
     movu   [rpq+xq+ 0], m1
@@ -676,6 +691,7 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
     add             xq, 5*8
     ret
 .write16:
+    _CET_ENDBR
     vinserti128     m1, m0, xm0, 1
     pshufb          m2, m1, m8
     movu   [rpq+xq+ 0], m2
@@ -704,6 +720,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     lea             aq, [aq+bx4q*4]
     jmp           bw4q
 .w32:
+    _CET_ENDBR
     mova     [aq-32*8], m0
     mova     [aq-32*7], m1
     mova     [aq-32*6], m2
@@ -711,10 +728,12 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     mova     [aq-32*4], m1
     mova     [aq-32*3], m2
 .w16:
+    _CET_ENDBR
     mova     [aq-32*2], m0
     mova     [aq-32*1], m1
     mova     [aq+32*0], m2
 .w8:
+    _CET_ENDBR
     mova     [aq+32*1], m0
     mova     [aq+32*2], m1
     mova     [aq+32*3], m2
@@ -722,18 +741,21 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
     jg .loop
     RET
 .w4:
+    _CET_ENDBR
     movu      [aq+ 80], m0
     mova      [aq+112], xm1
     dec           bh4d
     jg .loop
     RET
 .w2:
+    _CET_ENDBR
     movu      [aq+104], xm0
     movq      [aq+120], xm2
     dec           bh4d
     jg .loop
     RET
 .w1:
+    _CET_ENDBR
     movq      [aq+116], xm0
     movd      [aq+124], xm1
     dec           bh4d
@@ -833,25 +855,30 @@ cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign
     jg .loop_y
     RET
 .write1:
+    _CET_ENDBR
     vmovdqu8 [rpq+xq]{k2}, xm0
     add             xq, 5*1
     ret
 .write2:
+    _CET_ENDBR
     pshufb         xm0, xm8
     vmovdqu16 [rpq+xq]{k2}, xm0
     add             xq, 5*2
     ret
 .write4:
+    _CET_ENDBR
     vpermb         ym0, ym8, ym0
     vmovdqu32 [rpq+xq]{k2}, ym0
     add             xq, 5*4
     ret
 .write8:
+    _CET_ENDBR
     vpermb          m0, m8, m0
     vmovdqu64 [rpq+xq]{k2}, m0
     add             xq, 5*8
     ret
 .write16:
+    _CET_ENDBR
     vpermb          m1, m8, m0
     movu   [rpq+xq+ 0], m1
     pshufb         xm0, xm9
@@ -875,24 +902,28 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
     kmovb              k1, r1d
     jmp              bw4q
 .w1:
+    _CET_ENDBR
     mov                r1, [rrq+r6*8]
     vmovdqu16 [r1+bx4q*4]{k1}, xm0
     inc                r6
     jl .w1
     RET
 .w2:
+    _CET_ENDBR
     mov                r1, [rrq+r6*8]
     vmovdqu32 [r1+bx4q*4]{k1}, ym0
     inc                r6
     jl .w2
     RET
 .w4:
+    _CET_ENDBR
     mov                r1, [rrq+r6*8]
     vmovdqu64 [r1+bx4q*4]{k1}, m0
     inc                r6
     jl .w4
     RET
 .w8:
+    _CET_ENDBR
     pshufd            ym1, ym0, q1021
 .w8_loop:
     mov                r1, [rrq+r6*8+0]
@@ -905,6 +936,7 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
     jl .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     pshufd             m1, m0, q1021
     pshufd             m2, m0, q2102
 .w16_loop:
@@ -920,6 +952,7 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
     jl .w16_loop
     RET
 .w32:
+    _CET_ENDBR
     pshufd             m1, m0, q1021
     pshufd             m2, m0, q2102
 .w32_loop:
