$OpenBSD: patch-lib_Target_X86_X86FrameLowering_cpp,v 1.8 2020/08/05 06:49:48 jca Exp $

- Add RETGUARD to clang for amd64. This security mechanism uses per-function
  random cookies to protect access to function return instructions, with the
  effect that the integrity of the return address is protected, and function
  return instructions are harder to use in ROP gadgets.

  On function entry the return address is combined with a per-function random
  cookie and stored in the stack frame. The integrity of this value is verified
  before function return, and if this check fails, the program aborts. In this way
  RETGUARD is an improved stack protector, since the cookies are per-function. The
  verification routine is constructed such that the binary space immediately
  before each ret instruction is padded with int03 instructions, which makes these
  return instructions difficult to use in ROP gadgets. In the kernel, this has the
  effect of removing approximately 50% of total ROP gadgets, and 15% of unique
  ROP gadgets compared to the 6.3 release kernel. Function epilogues are
  essentially gadget free, leaving only the polymorphic gadgets that result from
  jumping into the instruction stream partway through other instructions. Work to
  remove these gadgets will continue through other mechanisms.
- Refactor retguard to make adding additional arches easier.
- implement -msave-args in clang/llvm, like the sun did for gcc

Index: lib/Target/X86/X86FrameLowering.cpp
--- lib/Target/X86/X86FrameLowering.cpp.orig
+++ lib/Target/X86/X86FrameLowering.cpp
@@ -14,6 +14,7 @@
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86ReturnProtectorLowering.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
@@ -38,7 +39,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget 
                                    MaybeAlign StackAlignOverride)
     : TargetFrameLowering(StackGrowsDown, StackAlignOverride.valueOrOne(),
                           STI.is64Bit() ? -8 : -4),
-      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {
+      STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()), RPL() {
   // Cache a bunch of frame-related predicates for this subtarget.
   SlotSize = TRI->getSlotSize();
   Is64Bit = STI.is64Bit();
@@ -46,6 +47,7 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget 
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   StackPtr = TRI->getStackRegister();
+  SaveArgs = Is64Bit ? STI.getSaveArgs() : 0;
 }
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
@@ -89,7 +91,8 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
           MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
           MFI.hasStackMap() || MFI.hasPatchPoint() ||
-          MFI.hasCopyImplyingStackAdjustment());
+          MFI.hasCopyImplyingStackAdjustment() ||
+          SaveArgs);
 }
 
 static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
@@ -872,6 +875,24 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasic
   MI->getOperand(3).setIsDead();
 }
 
+// FIXME: Get this from tablegen.
+static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
+                                                const X86Subtarget &Subtarget) {
+  assert(Subtarget.is64Bit());
+
+  if (Subtarget.isCallingConvWin64(CallConv)) {
+    static const MCPhysReg GPR64ArgRegsWin64[] = {
+      X86::RCX, X86::RDX, X86::R8,  X86::R9
+    };
+    return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
+  }
+
+  static const MCPhysReg GPR64ArgRegs64Bit[] = {
+    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+  };
+  return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
+}
+
 bool X86FrameLowering::has128ByteRedZone(const MachineFunction& MF) const {
   // x86-64 (non Win64) has a 128 byte red zone which is guaranteed not to be
   // clobbered by any interrupt handler.
@@ -1146,6 +1167,43 @@ void X86FrameLowering::emitPrologue(MachineFunction &M
                                     nullptr, DwarfFramePtr));
       }
 
+      if (SaveArgs && !Fn.arg_empty()) {
+        ArrayRef<MCPhysReg> GPRs =
+          get64BitArgumentGPRs(Fn.getCallingConv(), STI);
+        unsigned arg_size = Fn.arg_size();
+        unsigned RI = 0;
+        int64_t SaveSize = 0;
+
+        if (Fn.hasStructRetAttr()) {
+          GPRs = GPRs.drop_front(1);
+          arg_size--;
+        }
+
+        for (MCPhysReg Reg : GPRs) {
+          if (++RI > arg_size)
+            break;
+
+          SaveSize += SlotSize;
+
+          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+            .addReg(Reg)
+            .setMIFlag(MachineInstr::FrameSetup);
+        }
+
+        // Realign the stack. PUSHes are the most space efficient.
+        while (SaveSize % getStackAlignment()) {
+          BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+            .addReg(GPRs.front())
+            .setMIFlag(MachineInstr::FrameSetup);
+
+          SaveSize += SlotSize;
+        }
+
+	//dlg StackSize -= SaveSize;
+        //dlg MFI.setStackSize(StackSize);
+        X86FI->setSaveArgSize(SaveSize);
+      }
+
       if (NeedsWinFPO) {
         // .cv_fpo_setframe $FramePtr
         HasWinCFI = true;
@@ -1634,20 +1692,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &M
   }
   uint64_t SEHStackAllocAmt = NumBytes;
 
-  if (HasFP) {
-    // Pop EBP.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
-            MachineFramePtr)
-        .setMIFlag(MachineInstr::FrameDestroy);
-    if (NeedsDwarfCFI) {
-      unsigned DwarfStackPtr =
-          TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
-      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
-                                  nullptr, DwarfStackPtr, -SlotSize));
-      --MBBI;
-    }
-  }
-
   MachineBasicBlock::iterator FirstCSPop = MBBI;
   // Skip the callee-saved pop instructions.
   while (MBBI != MBB.begin()) {
@@ -1717,6 +1761,28 @@ void X86FrameLowering::emitEpilogue(MachineFunction &M
     --MBBI;
   }
 
+  if (HasFP) {
+    MBBI = Terminator;
+
+    if (X86FI->getSaveArgSize()) {
+      // LEAVE is effectively mov rbp,rsp; pop rbp
+      BuildMI(MBB, MBBI, DL, TII.get(X86::LEAVE64))
+        .setMIFlag(MachineInstr::FrameDestroy);
+    } else {
+      // Pop EBP.
+      BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+              MachineFramePtr)
+          .setMIFlag(MachineInstr::FrameDestroy);
+    }
+    if (NeedsDwarfCFI) {
+      unsigned DwarfStackPtr =
+          TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
+      BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
+                                  nullptr, DwarfStackPtr, -SlotSize));
+      --MBBI;
+    }
+  }
+
   // Windows unwinder will not invoke function's exception handler if IP is
   // either in prologue or in epilogue.  This behavior causes a problem when a
   // call immediately precedes an epilogue, because the return address points
@@ -1814,6 +1880,8 @@ int X86FrameLowering::getFrameIndexReference(const Mac
            "FPDelta isn't aligned per the Win64 ABI!");
   }
 
+  if (FI >= 0)
+    Offset -= X86FI->getSaveArgSize();
 
   if (TRI->hasBasePointer(MF)) {
     assert(HasFP && "VLAs and dynamic stack realign, but no FP?!");
@@ -3218,4 +3286,8 @@ void X86FrameLowering::processFunctionBeforeFrameFinal
   addFrameReference(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mi32)),
                     UnwindHelpFI)
       .addImm(-2);
+}
+
+const ReturnProtectorLowering *X86FrameLowering::getReturnProtector() const {
+  return &RPL;
 }
