/**************************************************************************/
/*                                                                        */
/*                                 OCaml                                  */
/*                                                                        */
/*             Xavier Leroy, projet Cristal, INRIA Rocquencourt           */
/*                                                                        */
/*   Copyright 2003 Institut National de Recherche en Informatique et     */
/*     en Automatique.                                                    */
/*                                                                        */
/*   All rights reserved.  This file is distributed under the terms of    */
/*   the GNU Lesser General Public License version 2.1, with the          */
/*   special exception on linking described in the file LICENSE.          */
/*                                                                        */
/**************************************************************************/

/* Asm part of the runtime system, AMD64 processor */
/* Must be preprocessed by cpp */

/* PIC mode support based on contribution by Paul Stravers (see PR#4795) */

#include "caml/m.h"

#if defined(SYS_macosx)

#define LBL(x) L##x
#define G(r) _##r
#define GREL(r) _##r@GOTPCREL
#define GCALL(r) _##r
#define TEXT_SECTION(name) .text
#define FUNCTION_ALIGN 2
#define EIGHT_ALIGN 3
#define SIXTEEN_ALIGN 4
#define FUNCTION(name) \
        .globl name; \
        .align FUNCTION_ALIGN; \
        name:

#elif defined(SYS_mingw64) || defined(SYS_cygwin)

#define LBL(x) .L##x
#define G(r) r
#undef  GREL
#define GCALL(r) r
#define TEXT_SECTION(name)
#define FUNCTION_ALIGN 4
#define EIGHT_ALIGN 8
#define SIXTEEN_ALIGN 16
#define FUNCTION(name) \
        TEXT_SECTION(name); \
        .globl name; \
        .align FUNCTION_ALIGN; \
        name:

#else /* Unix-like operating systems using ELF binaries */

#define LBL(x) .L##x
#define G(r) r
#define GREL(r) r@GOTPCREL
#define GCALL(r) r@PLT
#if defined(FUNCTION_SECTIONS)
#define TEXT_SECTION(name) .section .text.caml.##name,"ax",%progbits
#else
#define TEXT_SECTION(name)
#endif
#define FUNCTION_ALIGN 4
#define EIGHT_ALIGN 8
#define SIXTEEN_ALIGN 16
#define FUNCTION(name) \
        TEXT_SECTION(name); \
        .globl name; \
        .type name,@function; \
        .align FUNCTION_ALIGN; \
        name:

#endif

#if defined(SYS_linux) || defined(SYS_gnu)
#define ENDFUNCTION(name) \
        .size name, . - name
#else
#define ENDFUNCTION(name)
#endif

#include "../runtime/caml/asm.h"

#ifdef WITH_FRAME_POINTERS
#define FRAME_POINTER_SIZE   8
#else
#define FRAME_POINTER_SIZE   0
#endif

#define RETADDR_ENTRY_SIZE   (FRAME_POINTER_SIZE + 8) /* fp? + retaddr */

#ifdef WITH_FRAME_POINTERS
#define ENTER_FUNCTION                      \
        pushq   %rbp; CFI_ADJUST(8);        \
        movq    %rsp, %rbp;
#define LEAVE_FUNCTION                      \
        popq    %rbp; CFI_ADJUST(-8);
#else
#define ENTER_FUNCTION
#define LEAVE_FUNCTION
#endif

#define STACK_RETADDR_OFFSET(reg, ofs)  \
                            (FRAME_POINTER_SIZE + ofs)(reg)
#define STACK_RETADDR(reg)  STACK_RETADDR_OFFSET(reg, 0)
#define STACK_ARG_1(reg)    STACK_RETADDR_OFFSET(reg, 8)
#define STACK_ARG_2(reg)    STACK_RETADDR_OFFSET(reg, 16)


#ifdef DEBUG
#define IF_DEBUG(...) __VA_ARGS__
#else
#define IF_DEBUG(...)
#endif

/* struct stack_info */
#define Stack_sp                 0
#define Stack_exception          8
#define Stack_handler            16

/* struct stack_handler */
#define Handler_value(REG)       0(REG)
#define Handler_exception(REG)   8(REG)
#define Handler_effect(REG)     16(REG)
#define Handler_parent          24

/* struct c_stack_link */
#if defined(SYS_mingw64) || defined (SYS_cygwin)
#define Cstack_stack            32
#define Cstack_sp               40
#define Cstack_prev             48
#define SIZEOF_C_STACK_LINK     56
#else
#define Cstack_stack             0
#define Cstack_sp                8
#define Cstack_prev             16
#define SIZEOF_C_STACK_LINK     24
#endif

/******************************************************************************/
/* DWARF */
/******************************************************************************/

/* These amd64-specific register numbers are taken from
   Fig. 3.36 ("DWARF Register Number Mapping") of:

     System V Application Binary Interface
     AMD64 Architecture Processor Supplement
     Version 1.0
     https://github.com/hjl-tools/x86-psABI/wiki/x86-64-psABI-1.0.pdf */

#define DW_REG_rbx                3
#define DW_REG_rsp                7
#define DW_REG_r13                13

/******************************************************************************/
/* Access to the current domain state block. */
/******************************************************************************/

#define CAML_CONFIG_H_NO_TYPEDEFS
#include "../runtime/caml/config.h"

        .set    domain_curr_field, 0
#define DOMAIN_STATE(c_type, name) \
        .equ    domain_field_caml_##name, domain_curr_field ; \
        .set    domain_curr_field, domain_curr_field + 1
#include "../runtime/caml/domain_state.tbl"
#undef DOMAIN_STATE

#define Caml_state(var) (8*domain_field_caml_##var)(%r14)

/* Load address of global [label] in register [dst]. */
#if defined(__PIC__) && !defined(SYS_mingw64) && !defined(SYS_cygwin)
#define LEA_VAR(label,dst) \
        movq    GREL(label)(%rip), dst
#else
#define LEA_VAR(label,dst) \
        leaq    G(label)(%rip), dst
#endif

/******************************************************************************/
/* Stack switching operations */
/******************************************************************************/

/* Switch from OCaml to C stack. Clobbers %r10, %r11. */
#ifdef ASM_CFI_SUPPORTED
#define SWITCH_OCAML_TO_C_CFI                                   \
        CFI_REMEMBER_STATE;                                     \
          /* %rsp points to the c_stack_link. */                \
        .cfi_escape DW_CFA_def_cfa_expression, 5,               \
           DW_OP_breg + DW_REG_rsp, Cstack_sp, DW_OP_deref,     \
           DW_OP_plus_uconst, RETADDR_ENTRY_SIZE
#else
#define SWITCH_OCAML_TO_C_CFI
#endif
#define SWITCH_OCAML_TO_C                                  \
    /* Fill in Caml_state->current_stack->sp */            \
        movq    Caml_state(current_stack), %r10;           \
        movq    %rsp, Stack_sp(%r10);                      \
    /* Fill in Caml_state->c_stack */                      \
        movq    Caml_state(c_stack), %r11;                 \
        movq    %rsp, Cstack_sp(%r11);                     \
        movq    %r10, Cstack_stack(%r11);                  \
    /* Switch to C stack */                                \
        movq    %r11, %rsp;                                \
        SWITCH_OCAML_TO_C_CFI

/* Switch from C to OCaml stack.  Clobbers %r11. */
#define SWITCH_C_TO_OCAML                                           \
    /* Assert that %rsp == Caml_state->c_stack &&
        Caml_state->c_stack->sp == Caml_state->current_stack->sp */ \
        IF_DEBUG(cmpq %rsp, Caml_state(c_stack); je 8f; int3; 8:    \
                 movq Caml_state(current_stack), %r11;              \
                 movq Stack_sp(%r11), %r11;                         \
                 cmpq %r11, Cstack_sp(%rsp); je 8f; int3; 8:)       \
        movq    Cstack_sp(%rsp), %rsp;                              \
        CFI_RESTORE_STATE

/* Load Caml_state->exn_handler into %rsp and restores prior exn_handler.
   Clobbers %r11. */
#define RESTORE_EXN_HANDLER_OCAML              \
        movq    Caml_state(exn_handler), %rsp; \
        CFI_DEF_CFA_OFFSET(16);                \
        leaq    Caml_state(exn_handler), %r11; \
        popq    (%r11); CFI_ADJUST(-8)


/* When ThreadSanitizer instrumentation is enabled, the code must call
   the C functions __tsan_func_entry and __tsan_func_exit to signal
   function entries and exits. They follow the x86_64 calling
   convention of the platform. For efficiency reasons, we don't always
   save all caller-saved registers before calling them, but only the
   registers in use. */

/* Calls __tsan_func_entry on the current return address. [offset] is the
   distance from the current stack pointer to the saved return address, in
   bytes. May clobber all caller-saved registers. */
#if defined(WITH_THREAD_SANITIZER)
#define TSAN_ENTER_FUNCTION(offset)                             \
        /* arg 1: pc of raise */                                \
        movq    STACK_RETADDR_OFFSET(%rsp, offset), C_ARG_1;    \
        SWITCH_OCAML_TO_C;                                      \
        C_call  (GCALL(__tsan_func_entry));                     \
        SWITCH_C_TO_OCAML;

/* Calls __tsan_func_exit. May clobber all caller-saved registers. */
#define TSAN_EXIT_FUNCTION                                      \
        SWITCH_OCAML_TO_C;                                      \
        movq    $0, C_ARG_1;                                    \
        C_call  (GCALL(__tsan_func_exit));                      \
        SWITCH_C_TO_OCAML;
#else
#define TSAN_ENTER_FUNCTION(offset)
#define TSAN_EXIT_FUNCTION
#endif

/* Switch between OCaml stacks. Clobbers %r12.
   Expects old stack in %rsi and target stack in %r10.
   Leaves old stack in %rsi and target stack in %r10. */
#define SWITCH_OCAML_STACKS                               \
        ENTER_FUNCTION                                    \
    /* Save OCaml SP and exn_handler in the stack info */ \
        movq    %rsp, Stack_sp(%rsi);                     \
        movq    Caml_state(exn_handler), %r12;            \
        movq    %r12, Stack_exception(%rsi);              \
    /* switch stacks */                                   \
        movq    %r10, Caml_state(current_stack);          \
        movq    Stack_sp(%r10), %rsp;                     \
        CFI_DEF_CFA_OFFSET(8);                            \
    /* restore exn_handler for new stack */               \
        movq    Stack_exception(%r10), %r12;              \
        movq    %r12, Caml_state(exn_handler);            \
        LEAVE_FUNCTION

/* Updates the oldest saved frame pointer in the target fiber.

   A fiber stack may need to grow, causing the reallocation of the entire fiber,
   including stack_info and stack_handler structures.
   caml_try_realloc_stack will not be able to update the linked list of
   frame-pointers if it has been split (ie, in a continuation).
   caml_resume and caml_reperform use this macro to update the oldest saved rbp
   (highest one in the stack) in case the fiber was reallocated to reattach the
   frame-pointer linked list.

   REG: Stack_handler(target_fiber)

   The frame pointer will be pushed into the stack immediately after these
   instructions. The offset of the oldest saved rbp in a fiber from the stack
   handler is 48 = 4 words (caml_runstack) + 2 words (rip and rbp).
   */
#ifdef WITH_FRAME_POINTERS
#define UPDATE_BASE_POINTER(REG)                          \
        leaq    -8(%rsp), %r11;                           \
        movq    %r11, -48(REG)
#else
#define UPDATE_BASE_POINTER(REG)
#endif

/******************************************************************************/
/* Save and restore all callee-save registers on stack.
   Keep the stack 16-aligned. */
/******************************************************************************/

#if defined(SYS_mingw64) || defined(SYS_cygwin)

/* Win64 API: callee-save regs are rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 */

#define PUSH_CALLEE_SAVE_REGS                               \
        pushq   %rbx; CFI_ADJUST (8); CFI_OFFSET(rbx, -16); \
        pushq   %rbp; CFI_ADJUST (8); CFI_OFFSET(rbp, -24); \
                    /* Allows debugger to walk the stack */ \
        pushq   %rsi; CFI_ADJUST (8); CFI_OFFSET(rsi, -32); \
        pushq   %rdi; CFI_ADJUST (8); CFI_OFFSET(rdi, -40); \
        pushq   %r12; CFI_ADJUST (8); CFI_OFFSET(r12, -48); \
        pushq   %r13; CFI_ADJUST (8); CFI_OFFSET(r13, -56); \
        pushq   %r14; CFI_ADJUST (8); CFI_OFFSET(r14, -64); \
        pushq   %r15; CFI_ADJUST (8); CFI_OFFSET(r15, -72); \
        subq    $(10*16), %rsp; CFI_ADJUST (10*16);         \
        movupd  %xmm6, 0*16(%rsp);                          \
        movupd  %xmm7, 1*16(%rsp);                          \
        movupd  %xmm8, 2*16(%rsp);                          \
        movupd  %xmm9, 3*16(%rsp);                          \
        movupd  %xmm10, 4*16(%rsp);                         \
        movupd  %xmm11, 5*16(%rsp);                         \
        movupd  %xmm12, 6*16(%rsp);                         \
        movupd  %xmm13, 7*16(%rsp);                         \
        movupd  %xmm14, 8*16(%rsp);                         \
        movupd  %xmm15, 9*16(%rsp)

#define POP_CALLEE_SAVE_REGS                                \
        movupd  0*16(%rsp), %xmm6;                          \
        movupd  1*16(%rsp), %xmm7;                          \
        movupd  2*16(%rsp), %xmm8;                          \
        movupd  3*16(%rsp), %xmm9;                          \
        movupd  4*16(%rsp), %xmm10;                         \
        movupd  5*16(%rsp), %xmm11;                         \
        movupd  6*16(%rsp), %xmm12;                         \
        movupd  7*16(%rsp), %xmm13;                         \
        movupd  8*16(%rsp), %xmm14;                         \
        movupd  9*16(%rsp), %xmm15;                         \
        addq    $(10*16), %rsp; CFI_ADJUST (-10*16);        \
        popq    %r15; CFI_ADJUST(-8); CFI_SAME_VALUE(r15);  \
        popq    %r14; CFI_ADJUST(-8); CFI_SAME_VALUE(r14);  \
        popq    %r13; CFI_ADJUST(-8); CFI_SAME_VALUE(r13);  \
        popq    %r12; CFI_ADJUST(-8); CFI_SAME_VALUE(r12);  \
        popq    %rdi; CFI_ADJUST(-8); CFI_SAME_VALUE(rdi);  \
        popq    %rsi; CFI_ADJUST(-8); CFI_SAME_VALUE(rsi);  \
        popq    %rbp; CFI_ADJUST(-8); CFI_SAME_VALUE(rbp);  \
        popq    %rbx; CFI_ADJUST(-8); CFI_SAME_VALUE(rbx)

#define CALLEE_SAVE_REGS_SIZE (8 * 8 + 10 * 16)

#else

/* Unix API: callee-save regs are rbx, rbp, r12-r15 */

#define PUSH_CALLEE_SAVE_REGS \
        pushq   %rbx; CFI_ADJUST(8); CFI_OFFSET(rbx, -16); \
        pushq   %rbp; CFI_ADJUST(8); CFI_OFFSET(rbp, -24); \
        pushq   %r12; CFI_ADJUST(8); CFI_OFFSET(r12, -32); \
        pushq   %r13; CFI_ADJUST(8); CFI_OFFSET(r13, -40); \
        pushq   %r14; CFI_ADJUST(8); CFI_OFFSET(r14, -48); \
        pushq   %r15; CFI_ADJUST(8); CFI_OFFSET(r15, -56)

#define POP_CALLEE_SAVE_REGS \
        popq    %r15; CFI_ADJUST(-8); CFI_SAME_VALUE(r15); \
        popq    %r14; CFI_ADJUST(-8); CFI_SAME_VALUE(r14); \
        popq    %r13; CFI_ADJUST(-8); CFI_SAME_VALUE(r13); \
        popq    %r12; CFI_ADJUST(-8); CFI_SAME_VALUE(r12); \
        popq    %rbp; CFI_ADJUST(-8); CFI_SAME_VALUE(rbp); \
        popq    %rbx; CFI_ADJUST(-8); CFI_SAME_VALUE(rbx)

#define CALLEE_SAVE_REGS_SIZE (6 * 8)

#endif

#define C_call(target) call target

/******************************************************************************/
/* Registers holding arguments of C functions. */
/******************************************************************************/

#if defined(SYS_mingw64) || defined(SYS_cygwin)
#define C_ARG_1 %rcx
#define C_ARG_2 %rdx
#define C_ARG_3 %r8
#define C_ARG_4 %r9
#else
#define C_ARG_1 %rdi
#define C_ARG_2 %rsi
#define C_ARG_3 %rdx
#define C_ARG_4 %rcx
#endif

        .text

#if defined(FUNCTION_SECTIONS)
        TEXT_SECTION(caml_hot.code_begin)
        .globl  G(caml_hot.code_begin)
G(caml_hot.code_begin):

        TEXT_SECTION(caml_hot.code_end)
        .globl  G(caml_hot.code_end)
G(caml_hot.code_end):
#endif

/******************************************************************************/
/* text section */
/******************************************************************************/

        TEXT_SECTION(caml_system__code_begin)
        .globl  G(caml_system__code_begin)
G(caml_system__code_begin):
        ret  /* just one instruction, so that debuggers don't display
        caml_system__code_begin instead of caml_call_gc */

/******************************************************************************/
/* Allocation */
/******************************************************************************/

/* Save all of the registers that may be in use to a free gc_regs bucket.
   Returns: bucket in %r15. Clobbers %r11 (after saving it) */
#define SAVE_ALL_REGS                                  \
    /* First, save the young_ptr. */                   \
        movq    %r15, Caml_state(young_ptr);           \
    /* Now, use %r15 to point to the gc_regs bucket */ \
    /* We save %r11 first to allow it to be scratch */ \
        movq    Caml_state(gc_regs_buckets), %r15;     \
        movq    %r11, 11*8(%r15);                      \
        movq    0(%r15), %r11; /* next ptr */          \
        movq    %r11, Caml_state(gc_regs_buckets);     \
        movq    %rax,         0*8(%r15);               \
        movq    %rbx,         1*8(%r15);               \
        movq    %rdi,         2*8(%r15);               \
        movq    %rsi,         3*8(%r15);               \
        movq    %rdx,         4*8(%r15);               \
        movq    %rcx,         5*8(%r15);               \
        movq    %r8,          6*8(%r15);               \
        movq    %r9,          7*8(%r15);               \
        movq    %r12,         8*8(%r15);               \
        movq    %r13,         9*8(%r15);               \
        movq    %r10,        10*8(%r15);               \
             /* %r11 is at   11*8(%r15); */            \
        movq    %rbp,        12*8(%r15);               \
        movsd   %xmm0,   (0+13)*8(%r15);               \
        movsd   %xmm1,   (1+13)*8(%r15);               \
        movsd   %xmm2,   (2+13)*8(%r15);               \
        movsd   %xmm3,   (3+13)*8(%r15);               \
        movsd   %xmm4,   (4+13)*8(%r15);               \
        movsd   %xmm5,   (5+13)*8(%r15);               \
        movsd   %xmm6,   (6+13)*8(%r15);               \
        movsd   %xmm7,   (7+13)*8(%r15);               \
        movsd   %xmm8,   (8+13)*8(%r15);               \
        movsd   %xmm9,   (9+13)*8(%r15);               \
        movsd   %xmm10, (10+13)*8(%r15);               \
        movsd   %xmm11, (11+13)*8(%r15);               \
        movsd   %xmm12, (12+13)*8(%r15);               \
        movsd   %xmm13, (13+13)*8(%r15);               \
        movsd   %xmm14, (14+13)*8(%r15);               \
        movsd   %xmm15, (15+13)*8(%r15)

/* Undo SAVE_ALL_REGS. Expects gc_regs bucket in %r15 */
#define RESTORE_ALL_REGS                               \
    /* Restore %rax, freeing up the next ptr slot */   \
        movq    0*8(%r15), %rax;                       \
        movq    Caml_state(gc_regs_buckets), %r11;     \
        movq    %r11, 0(%r15); /* next ptr */          \
        movq    %r15, Caml_state(gc_regs_buckets);     \
           /* above:  0*8(%r15),%rax; */               \
        movq          1*8(%r15),%rbx;                  \
        movq          2*8(%r15),%rdi;                  \
        movq          3*8(%r15),%rsi;                  \
        movq          4*8(%r15),%rdx;                  \
        movq          5*8(%r15),%rcx;                  \
        movq          6*8(%r15),%r8;                   \
        movq          7*8(%r15),%r9;                   \
        movq          8*8(%r15),%r12;                  \
        movq          9*8(%r15),%r13;                  \
        movq         10*8(%r15),%r10;                  \
        movq         11*8(%r15),%r11;                  \
        movq         12*8(%r15),%rbp;                  \
        movsd    (0+13)*8(%r15),%xmm0;                 \
        movsd    (1+13)*8(%r15),%xmm1;                 \
        movsd    (2+13)*8(%r15),%xmm2;                 \
        movsd    (3+13)*8(%r15),%xmm3;                 \
        movsd    (4+13)*8(%r15),%xmm4;                 \
        movsd    (5+13)*8(%r15),%xmm5;                 \
        movsd    (6+13)*8(%r15),%xmm6;                 \
        movsd    (7+13)*8(%r15),%xmm7;                 \
        movsd    (8+13)*8(%r15),%xmm8;                 \
        movsd    (9+13)*8(%r15),%xmm9;                 \
        movsd   (10+13)*8(%r15),%xmm10;                \
        movsd   (11+13)*8(%r15),%xmm11;                \
        movsd   (12+13)*8(%r15),%xmm12;                \
        movsd   (13+13)*8(%r15),%xmm13;                \
        movsd   (14+13)*8(%r15),%xmm14;                \
        movsd   (15+13)*8(%r15),%xmm15;                \
        movq    Caml_state(young_ptr), %r15

/* Save non-callee saved registers that may be in use to a free gc_regs
   bucket, except for %xmm8 to %xmm15 as they are not used to pass
   arguments and return values in the System V amd64 ABI.
   Returns: bucket in %r15. Clobbers %r11 (after saving it) */
#ifdef WITH_THREAD_SANITIZER
#define TSAN_SAVE_CALLER_REGS                               \
    /* First, save the young_ptr. */                   \
        movq    %r15, Caml_state(young_ptr);           \
    /* Now, use %r15 to point to the gc_regs bucket */ \
    /* We save %r11 first to allow it to be scratch */ \
        movq    Caml_state(gc_regs_buckets), %r15;     \
        movq    %r11, 11*8(%r15);                      \
        movq    0(%r15), %r11; /* next ptr */          \
        movq    %r11, Caml_state(gc_regs_buckets);     \
        movq    %rax,         0*8(%r15);               \
        movq    %rdi,         2*8(%r15);               \
        movq    %rsi,         3*8(%r15);               \
        movq    %rdx,         4*8(%r15);               \
        movq    %rcx,         5*8(%r15);               \
        movq    %r8,          6*8(%r15);               \
        movq    %r9,          7*8(%r15);               \
        movq    %r10,        10*8(%r15);               \
             /* %r11 is at   11*8(%r15); */            \
        movsd   %xmm0,   (0+13)*8(%r15);               \
        movsd   %xmm1,   (1+13)*8(%r15);               \
        movsd   %xmm2,   (2+13)*8(%r15);               \
        movsd   %xmm3,   (3+13)*8(%r15);               \
        movsd   %xmm4,   (4+13)*8(%r15);               \
        movsd   %xmm5,   (5+13)*8(%r15);               \
        movsd   %xmm6,   (6+13)*8(%r15);               \
        movsd   %xmm7,   (7+13)*8(%r15);
#else
#define TSAN_SAVE_CALLER_REGS
#endif

/* Restore registers saved by TSAN_SAVE_CALLER_REGS. Expects gc_regs bucket in
   %r15 */
#ifdef WITH_THREAD_SANITIZER
#define TSAN_RESTORE_CALLER_REGS                            \
    /* Restore %rax, freeing up the next ptr slot */   \
        movq    0*8(%r15), %rax;                       \
        movq    Caml_state(gc_regs_buckets), %r11;     \
        movq    %r11, 0(%r15); /* next ptr */          \
        movq    %r15, Caml_state(gc_regs_buckets);     \
           /* above:  0*8(%r15),%rax; */               \
        movq          2*8(%r15),%rdi;                  \
        movq          3*8(%r15),%rsi;                  \
        movq          4*8(%r15),%rdx;                  \
        movq          5*8(%r15),%rcx;                  \
        movq          6*8(%r15),%r8;                   \
        movq          7*8(%r15),%r9;                   \
        movq         10*8(%r15),%r10;                  \
        movq         11*8(%r15),%r11;                  \
        movsd    (0+13)*8(%r15),%xmm0;                 \
        movsd    (1+13)*8(%r15),%xmm1;                 \
        movsd    (2+13)*8(%r15),%xmm2;                 \
        movsd    (3+13)*8(%r15),%xmm3;                 \
        movsd    (4+13)*8(%r15),%xmm4;                 \
        movsd    (5+13)*8(%r15),%xmm5;                 \
        movsd    (6+13)*8(%r15),%xmm6;                 \
        movsd    (7+13)*8(%r15),%xmm7;                 \
        movq    Caml_state(young_ptr), %r15
#else
#define TSAN_RESTORE_CALLER_REGS
#endif

FUNCTION(G(caml_call_realloc_stack))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
        ENTER_FUNCTION
        SAVE_ALL_REGS
        movq    RETADDR_ENTRY_SIZE(%rsp), C_ARG_1 /* argument */
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_try_realloc_stack))
        SWITCH_C_TO_OCAML
        cmpq    $0, %rax
        jz      1f
        RESTORE_ALL_REGS
        LEAVE_FUNCTION
        ret
1:      RESTORE_ALL_REGS
        LEA_VAR(caml_exn_Stack_overflow, %rax)
        add     $16, %rsp /* pop argument, retaddr */
        jmp     GCALL(caml_raise_exn)
CFI_ENDPROC
ENDFUNCTION(G(caml_call_realloc_stack))

FUNCTION(G(caml_call_gc))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
        ENTER_FUNCTION
LBL(caml_call_gc):
        SAVE_ALL_REGS
        movq    %r15, Caml_state(gc_regs)
        TSAN_ENTER_FUNCTION(0)
        SWITCH_OCAML_TO_C
        C_call (GCALL(caml_garbage_collection))
        SWITCH_C_TO_OCAML
        TSAN_EXIT_FUNCTION
        movq    Caml_state(gc_regs), %r15
        RESTORE_ALL_REGS
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_call_gc))

FUNCTION(G(caml_alloc1))
CFI_STARTPROC
        ENTER_FUNCTION
        subq    $16, %r15
        cmpq    Caml_state(young_limit), %r15
        jb      LBL(caml_call_gc)
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_alloc1))

FUNCTION(G(caml_alloc2))
CFI_STARTPROC
        ENTER_FUNCTION
        subq    $24, %r15
        cmpq    Caml_state(young_limit), %r15
        jb      LBL(caml_call_gc)
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_alloc2))

FUNCTION(G(caml_alloc3))
CFI_STARTPROC
        ENTER_FUNCTION
        subq    $32, %r15
        cmpq    Caml_state(young_limit), %r15
        jb      LBL(caml_call_gc)
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_alloc3))

FUNCTION(G(caml_allocN))
CFI_STARTPROC
        ENTER_FUNCTION
        cmpq    Caml_state(young_limit), %r15
        jb      LBL(caml_call_gc)
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_allocN))

/******************************************************************************/
/* Call a C function from OCaml */
/******************************************************************************/

/* Update [young_limit] when returning from non-noalloc extern calls.
   Here is C code that can be used to generate RET_FROM_C_CALL for a
   new back-end.

     #include <stdatomic.h>
     #include <stdint.h>

     typedef struct { _Atomic(uint64_t) young_limit;
                      int64_t action_pending; } caml_domain_state;

     void ret_from_c_call(caml_domain_state *dom_st)
     {
       if (__builtin_expect(dom_st->action_pending, 0))
         atomic_store_explicit(&dom_st->young_limit, (uint64_t)-1,
                               memory_order_relaxed);
     }

*/
#define RET_FROM_C_CALL                         \
        /* Test the least-significant byte of action_pending */ \
        cmpb    $0, Caml_state(action_pending); \
        jne     1f;                             \
        ret;                                    \
1:      movq    $-1, Caml_state(young_limit);   \
        ret

FUNCTION(G(caml_c_call))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
        ENTER_FUNCTION
        TSAN_SAVE_CALLER_REGS
        TSAN_ENTER_FUNCTION(0)
        TSAN_RESTORE_CALLER_REGS
LBL(caml_c_call):
    /* Arguments:
        C arguments         : %rdi, %rsi, %rdx, %rcx, %r8, and %r9
        C function          : %rax */
    /* Switch from OCaml to C */
        SWITCH_OCAML_TO_C
    /* Make the alloc ptr available to the C code */
        movq    %r15, Caml_state(young_ptr)
    /* Call the function (address in %rax) */
        C_call  (*%rax)
    /* Prepare for return to OCaml */
        movq    Caml_state(young_ptr), %r15
    /* Load ocaml stack and restore global variables */
        SWITCH_C_TO_OCAML
#ifdef WITH_THREAD_SANITIZER
    /* Save non-callee-saved registers %rax and %xmm0 before C call */
        pushq   %rax; CFI_ADJUST(8);
        subq    $16, %rsp; CFI_ADJUST(16);
        movupd   %xmm0, (%rsp)
        TSAN_EXIT_FUNCTION
        movupd  (%rsp), %xmm0
        addq    $16, %rsp; CFI_ADJUST(-16);
        popq    %rax; CFI_ADJUST(-8);
#endif
        LEAVE_FUNCTION
    /* Return to OCaml caller */
        RET_FROM_C_CALL
CFI_ENDPROC
ENDFUNCTION(G(caml_c_call))

FUNCTION(G(caml_c_call_stack_args))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
        ENTER_FUNCTION
    /* Arguments:
        C arguments         : %rdi, %rsi, %rdx, %rcx, %r8, and %r9
        C function          : %rax
        C stack args        : begin=%r13 end=%r12 */
    /* Switch from OCaml to C */
        SWITCH_OCAML_TO_C
    /* we use %rbx (otherwise unused) to enable backtraces */
        movq    %rsp, %rbx
#ifdef ASM_CFI_SUPPORTED
        .cfi_escape DW_CFA_def_cfa_expression, 5,           \
          /* %rbp points to the c_stack_link structure */   \
          DW_OP_breg + DW_REG_rbx, Cstack_sp, DW_OP_deref,  \
          DW_OP_plus_uconst, RETADDR_ENTRY_SIZE
#endif
    /* Make the alloc ptr available to the C code */
        movq    %r15, Caml_state(young_ptr)
    /* Copy arguments from OCaml to C stack */
#if defined(SYS_mingw64) || defined (SYS_cygwin)
        addq    $32, %rsp
#endif
LBL(105):
        subq    $8, %r12
        cmpq    %r13,%r12
        jb      LBL(106)
        push    (%r12); CFI_ADJUST(8)
        jmp     LBL(105)
LBL(106):
#if defined(SYS_mingw64) || defined (SYS_cygwin)
        subq    $32, %rsp
#endif
    /* Call the function (address in %rax) */
        C_call  (*%rax)
    /* Pop arguments back off the stack */
        movq    Caml_state(c_stack), %rsp
    /* Prepare for return to OCaml */
        movq    Caml_state(young_ptr), %r15
    /* Load ocaml stack and restore global variables */
        SWITCH_C_TO_OCAML
    /* Return to OCaml caller */
        LEAVE_FUNCTION
        RET_FROM_C_CALL
CFI_ENDPROC
ENDFUNCTION(G(caml_c_call_stack_args))

/******************************************************************************/
/* Start the OCaml program */
/******************************************************************************/

FUNCTION(G(caml_start_program))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
    /* Save callee-save registers */
        PUSH_CALLEE_SAVE_REGS
#if defined(WITH_THREAD_SANITIZER)
    /* We can't use the TSAN_ENTER_FUNCTION macro as it assumes an OCaml stack,
       and we are on a C stack. */
        /* Save C_ARG_1 before C call */
        pushq   C_ARG_1; CFI_ADJUST(8)
        /* Read return address */
        movq    (8 + CALLEE_SAVE_REGS_SIZE)(%rsp), C_ARG_1
        C_call  (GCALL(__tsan_func_entry))
        popq    C_ARG_1; CFI_ADJUST(-8)
#endif
    /* Load Caml_state into r14 (was passed as an argument from C) */
        movq    C_ARG_1, %r14
    /* Initial entry point is G(caml_program) */
        LEA_VAR(caml_program, %r12)
#ifdef DEBUG
        movq    $0, %rax  /* dummy */
        movq    $0, %rbx  /* dummy */
        movq    $0, %rdi  /* dummy */
        movq    $0, %rsi  /* dummy */
#endif
    /* Common code for caml_start_program and caml_callback* */
LBL(caml_start_program):
    /* Load young_ptr into %r15 */
        movq    Caml_state(young_ptr), %r15
    /* Build struct c_stack_link on the C stack */
        subq    $SIZEOF_C_STACK_LINK, %rsp; CFI_ADJUST(SIZEOF_C_STACK_LINK)
        movq    $0, Cstack_stack(%rsp)
        movq    $0, Cstack_sp(%rsp)
        movq    Caml_state(c_stack), %r10
        movq    %r10, Cstack_prev(%rsp)
        movq    %rsp, Caml_state(c_stack)
    /* Load the OCaml stack. */
        movq    Caml_state(current_stack), %r11
        movq    Stack_sp(%r11), %r10
    /* Store the stack pointer to allow DWARF unwind */
        subq    $16, %r10
        movq    %rsp, 0(%r10) /* C_STACK_SP */
    /* Store the gc_regs for callbacks during a GC */
        movq    Caml_state(gc_regs), %r11
        movq    %r11, 8(%r10)
    /* Build a handler for exceptions raised in OCaml on the OCaml stack. */
        subq    $16, %r10
        lea     LBL(109)(%rip), %r11
        movq    %r11, 8(%r10)
    /* link in the previous exn_handler so that copying stacks works */
        movq    Caml_state(exn_handler), %r11
        movq    %r11, 0(%r10)
        movq    %r10, Caml_state(exn_handler)
    /* Switch stacks and call the OCaml code */
        movq    %r10, %rsp
#ifdef ASM_CFI_SUPPORTED
        CFI_REMEMBER_STATE
        .cfi_escape DW_CFA_def_cfa_expression, 3 + 2,                 \
            /* %rsp points to the exn handler on the OCaml stack */   \
            /* %rsp + 16 contains the C_STACK_SP */                   \
          DW_OP_breg + DW_REG_rsp, 16 /* exn handler */, DW_OP_deref, \
          DW_OP_plus_uconst,                                          \
             24  /* struct c_stack_link */ +                          \
             6*8 /* callee save regs */ +                             \
             8   /* ret addr */
#endif
        call    *%r12
LBL(108):
    /* pop exn handler */
        movq    0(%rsp), %r11
        movq    %r11, Caml_state(exn_handler)
        leaq    16(%rsp), %r10
1:  /* restore GC regs */
        movq    8(%r10), %r11
        movq    %r11, Caml_state(gc_regs)
        addq    $16, %r10
    /* Update alloc ptr */
        movq    %r15, Caml_state(young_ptr)
    /* Return to C stack. */
        movq    Caml_state(current_stack), %r11
        movq    %r10, Stack_sp(%r11)
        movq    Caml_state(c_stack), %rsp
        CFI_RESTORE_STATE
    /* Pop the struct c_stack_link */
        movq    Cstack_prev(%rsp), %r10
        movq    %r10, Caml_state(c_stack)
        addq    $SIZEOF_C_STACK_LINK, %rsp; CFI_ADJUST(-SIZEOF_C_STACK_LINK)
#if defined(WITH_THREAD_SANITIZER)
    /* We can't use the TSAN_EXIT_FUNCTION macro as it assumes an OCaml stack,
       and we are on a C stack. */
        /* Save %rax before C call */
        pushq   %rax; CFI_ADJUST(8)
        movq    $0, C_ARG_1
        C_call  (GCALL(__tsan_func_exit))
        popq    %rax; CFI_ADJUST(-8)
#endif
    /* Restore callee-save registers. */
        POP_CALLEE_SAVE_REGS
    /* Return to caller. */
        ret
LBL(109):
    /* Exception handler*/
    /* Mark the bucket as an exception result and return it */
        orq     $2, %rax
        /* exn handler already popped here */
        movq    %rsp, %r10
        jmp     1b
CFI_ENDPROC
ENDFUNCTION(G(caml_start_program))

/******************************************************************************/
/* Exceptions */
/******************************************************************************/

/* Raise an exception from OCaml */

FUNCTION(G(caml_raise_exn))
CFI_STARTPROC
        ENTER_FUNCTION
LBL(caml_raise_exn):
        testq   $1, Caml_state(backtrace_active)
        jne   LBL(116)
        RESTORE_EXN_HANDLER_OCAML
        ret
LBL(116):
        movq    $0, Caml_state(backtrace_pos)
LBL(117):
        movq    %rsp, %r13        /* Save OCaml stack pointer */
        movq    %rax, %r12        /* Save exception bucket */
        movq    Caml_state(c_stack), %rsp
        movq    %rax, C_ARG_1                    /* arg 1: exception bucket */
        movq    STACK_RETADDR(%r13), C_ARG_2     /* arg 2: pc of raise */
        leaq    STACK_ARG_1(%r13), C_ARG_3       /* arg 3: sp at raise */
        movq    Caml_state(exn_handler), C_ARG_4 /* arg 4: sp of handler */
        C_call  (GCALL(caml_stash_backtrace))
        movq    %r12, %rax        /* Recover exception bucket */
        RESTORE_EXN_HANDLER_OCAML
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_raise_exn))

#if defined(WITH_THREAD_SANITIZER)
/* When TSan support is enabled, this routine should be called just before
   raising an exception. It calls __tsan_func_exit for every OCaml frame about
   to be exited due to the exception.
   Takes no arguments, clobbers C_ARG_1, C_ARG_2, C_ARG_3 and potentially all
   caller-saved registers of the C calling convention. */
FUNCTION(G(caml_tsan_exit_on_raise_asm))
CFI_STARTPROC
        ENTER_FUNCTION
        movq    STACK_RETADDR(%rsp), C_ARG_1   /* arg 1: pc of raise */
        leaq    STACK_ARG_1(%rsp), C_ARG_2  /* arg 2: sp at raise */
        movq    Caml_state(exn_handler), C_ARG_3 /* arg 3: sp of handler */
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_tsan_exit_on_raise))
        SWITCH_C_TO_OCAML
        LEAVE_FUNCTION
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_tsan_exit_on_raise_asm))
#endif

FUNCTION(G(caml_reraise_exn))
CFI_STARTPROC
        ENTER_FUNCTION
        testq   $1, Caml_state(backtrace_active)
        jne   LBL(117)
        RESTORE_EXN_HANDLER_OCAML
        ret
CFI_ENDPROC
ENDFUNCTION(G(caml_reraise_exn))

/* Raise an exception from C */

FUNCTION(G(caml_raise_exception))
CFI_STARTPROC
        ENTER_FUNCTION
        movq    C_ARG_1, %r14                /* Caml_state */
        movq    C_ARG_2, %rax
    /* Load young_ptr into %r15 */
        movq    Caml_state(young_ptr), %r15
    /* Discard the C stack pointer and reset to ocaml stack */
        movq    Caml_state(current_stack), %r10
        movq    Stack_sp(%r10), %rsp         /* FIXME: CFI */
#if defined(WITH_THREAD_SANITIZER)
    /* Call __tsan_func_exit for every OCaml stack frame exited due to the
       exception */
        movq    STACK_RETADDR(%rsp), C_ARG_1   /* arg 1: pc of raise */
        leaq    STACK_ARG_1(%rsp), C_ARG_2  /* arg 2: sp at raise */
        movq    Caml_state(exn_handler), C_ARG_3 /* arg 3: sp of handler */
        pushq   %rax; CFI_ADJUST(8)
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_tsan_exit_on_raise))
        SWITCH_C_TO_OCAML
        popq    %rax; CFI_ADJUST(-8)
#endif
        jmp LBL(caml_raise_exn)
CFI_ENDPROC
ENDFUNCTION(G(caml_raise_exception))

/******************************************************************************/
/* Callback from C to OCaml */
/******************************************************************************/

FUNCTION(G(caml_callback_asm))
CFI_STARTPROC
#if defined(WITH_THREAD_SANITIZER)
    /* Save non-callee-saved registers C_ARG_1, C_ARG_2, C_ARG_3 before C call
     */
        pushq   C_ARG_1; CFI_ADJUST(8)
        pushq   C_ARG_2; CFI_ADJUST(8)
        pushq   C_ARG_3; CFI_ADJUST(8)
        movq    24(%rsp), C_ARG_1 /* Read return address */
        C_call  (GCALL(__tsan_func_entry))
        popq    C_ARG_3; CFI_ADJUST(-8)
        popq    C_ARG_2; CFI_ADJUST(-8)
        popq    C_ARG_1; CFI_ADJUST(-8)
#endif
    /* Save callee-save registers */
        PUSH_CALLEE_SAVE_REGS
    /* Initial loading of arguments */
        movq    C_ARG_1, %r14      /* Caml_state */
        movq    C_ARG_2, %rbx      /* closure */
        movq    0(C_ARG_3), %rax   /* argument */
        movq    0(%rbx), %r12      /* code pointer */
        movq    $0, %rdi           /* dummy */
        movq    $0, %rsi           /* dummy */
        jmp     LBL(caml_start_program)
CFI_ENDPROC
ENDFUNCTION(G(caml_callback_asm))

FUNCTION(G(caml_callback2_asm))
CFI_STARTPROC
#if defined(WITH_THREAD_SANITIZER)
    /* Save non-callee-saved registers C_ARG_1, C_ARG_2, C_ARG_3 before C call
     */
        pushq   C_ARG_1; CFI_ADJUST(8)
        pushq   C_ARG_2; CFI_ADJUST(8)
        pushq   C_ARG_3; CFI_ADJUST(8)
        movq    24(%rsp), C_ARG_1 /* Read return address */
        C_call  (GCALL(__tsan_func_entry))
        popq    C_ARG_3; CFI_ADJUST(-8)
        popq    C_ARG_2; CFI_ADJUST(-8)
        popq    C_ARG_1; CFI_ADJUST(-8)
#endif
    /* Save callee-save registers */
        PUSH_CALLEE_SAVE_REGS
    /* Initial loading of arguments */
        movq    C_ARG_1, %r14      /* Caml_state */
        movq    C_ARG_2, %rdi      /* closure */
        movq    0(C_ARG_3), %rax   /* first argument */
        movq    8(C_ARG_3), %rbx   /* second argument */
        LEA_VAR(caml_apply2, %r12) /* code pointer */
        movq    $0, %rsi           /* dummy */
        jmp     LBL(caml_start_program)
CFI_ENDPROC
ENDFUNCTION(G(caml_callback2_asm))

FUNCTION(G(caml_callback3_asm))
CFI_STARTPROC
#if defined(WITH_THREAD_SANITIZER)
    /* Save non-callee-saved registers C_ARG_1, C_ARG_2, C_ARG_3 before C call
     */
        pushq   C_ARG_1; CFI_ADJUST(8)
        pushq   C_ARG_2; CFI_ADJUST(8)
        pushq   C_ARG_3; CFI_ADJUST(8)
        movq    24(%rsp), C_ARG_1 /* Read return address */
        C_call  (GCALL(__tsan_func_entry))
        popq    C_ARG_3; CFI_ADJUST(-8)
        popq    C_ARG_2; CFI_ADJUST(-8)
        popq    C_ARG_1; CFI_ADJUST(-8)
#endif
    /* Save callee-save registers */
        PUSH_CALLEE_SAVE_REGS
    /* Initial loading of arguments */
        movq    C_ARG_1, %r14      /* Caml_state */
        movq    0(C_ARG_3), %rax   /* first argument */
        movq    8(C_ARG_3), %rbx   /* second argument */
        movq    C_ARG_2, %rsi      /* closure */
        movq    16(C_ARG_3), %rdi  /* third argument */
        LEA_VAR(caml_apply3, %r12) /* code pointer */
        jmp     LBL(caml_start_program)
CFI_ENDPROC
ENDFUNCTION(G(caml_callback3_asm))

/******************************************************************************/
/* Fibers */
/*
 * A continuation is a one word object that points to a fiber. A fiber [f] will
 * point to its parent at Handler_parent(Stack_handler(f)). In the following,
 * the [last_fiber] refers to the last fiber in the linked-list formed by the
 * parent pointer.
 */
/******************************************************************************/

FUNCTION(G(caml_perform))
CFI_STARTPROC
    /*  %rax: effect to perform
        %rbx: freshly allocated continuation */
        movq    Caml_state(current_stack), %rsi /* %rsi := old stack */
        leaq    1(%rsi), %rdi /* %rdi (last_fiber) := Val_ptr(old stack) */
        movq    %rdi, 0(%rbx) /* Initialise continuation */
LBL(do_perform):
    /*  %rdi: last_fiber
        %rsi: old stack */
#if defined(WITH_THREAD_SANITIZER)
    /* Signal to TSan all stack frames exited by the perform. */
        ENTER_FUNCTION
        TSAN_SAVE_CALLER_REGS
        movq    STACK_RETADDR(%rsp), C_ARG_1  /* arg 1: pc of perform */
        leaq    STACK_ARG_1(%rsp), C_ARG_2    /* arg 2: sp at perform */
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_tsan_exit_on_perform))
        SWITCH_C_TO_OCAML
        TSAN_RESTORE_CALLER_REGS
        LEAVE_FUNCTION
#endif
        movq    %rdi, 8(%rbx) /* Set the last fiber field in the continuation */
        movq    Stack_handler(%rsi), %r11  /* %r11 := old stack -> handler */
        movq    Handler_parent(%r11), %r10 /* %r10 := parent stack */
        cmpq    $0, %r10                   /* parent is NULL? */
        je      LBL(112)
#if defined(WITH_THREAD_SANITIZER)
     /* Save non-callee-saved registers %rax, %rdi, %rsi, %r10, %r11 before C
        call */
        pushq   %rax; CFI_ADJUST(8);
        pushq   %rdi; CFI_ADJUST(8);
        pushq   %rsi; CFI_ADJUST(8);
        pushq   %r10; CFI_ADJUST(8);
        pushq   %r11; CFI_ADJUST(8);
     /* Match the TSan-enter made from caml_runstack */
        TSAN_EXIT_FUNCTION
        popq   %r11; CFI_ADJUST(-8);
        popq   %r10; CFI_ADJUST(-8);
        popq   %rsi; CFI_ADJUST(-8);
        popq   %rdi; CFI_ADJUST(-8);
        popq   %rax; CFI_ADJUST(-8);
#endif
        SWITCH_OCAML_STACKS /* preserves r11 and rsi */
     /* We have to null the Handler_parent after the switch because the
        Handler_parent is needed to unwind the stack for backtraces */
        movq    $0, Handler_parent(%r11) /* Set parent of performer to NULL */
        movq    Handler_effect(%r11), %rsi  /* %rsi := effect handler */
        jmp     GCALL(caml_apply3)
LBL(112):
    /* Switch back to original performer before raising Effect.Unhandled
       (no-op unless this is a reperform) */
        movq    0(%rbx), %r10  /* load performer stack from continuation */
        subq    $1, %r10       /* r10 := Ptr_val(r10) */
        movq    Caml_state(current_stack), %rsi
        SWITCH_OCAML_STACKS
    /* No parent stack. Raise Effect.Unhandled. */
        ENTER_FUNCTION
#if defined(WITH_THREAD_SANITIZER)
        /* We must let the TSan runtime know that we switched back to the
           original performer stack. For that, we perform the necessary calls
           to __tsan_func_entry via caml_tsan_entry_on_resume.
           Note that from TSan's point of view, we just exited all stack
           frames, including those of the main fiber. This is ok, because we
           re-enter them immediately via caml_tsan_entry_on_resume below. */
        TSAN_SAVE_CALLER_REGS
        movq    STACK_RETADDR(%rsp), C_ARG_1       /* arg 1: pc of perform */
        leaq    STACK_ARG_1(%rsp), C_ARG_2         /* arg 2: sp at perform */
        movq    %r10, C_ARG_3                      /* arg 3: performer stack */
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_tsan_entry_on_resume))
        SWITCH_C_TO_OCAML
        TSAN_ENTER_FUNCTION(0)
        TSAN_RESTORE_CALLER_REGS
#endif
        movq    %rax, C_ARG_1
        LEA_VAR(caml_raise_unhandled_effect, %rax)
        jmp     LBL(caml_c_call)
CFI_ENDPROC
ENDFUNCTION(G(caml_perform))

FUNCTION(G(caml_reperform))
CFI_STARTPROC
    /*  %rax: effect to reperform
        %rbx: continuation
        %rdi: last_fiber */
        movq    Caml_state(current_stack), %rsi  /* %rsi := old stack */
        movq    (Stack_handler-1)(%rdi), %r10
        movq    %rsi, Handler_parent(%r10)       /* Append to last_fiber */
        leaq    1(%rsi), %rdi  /* %rdi (last_fiber) := Val_ptr(old stack) */
     /* Need to update the oldest saved frame pointer here as the execution of
        the handler may have caused the current fiber stack to reallocate. */
        UPDATE_BASE_POINTER(%r10)
        jmp     LBL(do_perform)
CFI_ENDPROC
ENDFUNCTION(G(caml_reperform))

FUNCTION(G(caml_resume))
CFI_STARTPROC
    /* %rax -> fiber, %rbx -> fun, %rdi -> arg, %rsi -> last_fiber */
        leaq    -1(%rax), %r10  /* %r10 (new stack) = Ptr_val(%rax) */
        movq    %rdi, %rax      /* %rax := argument to the function in %rbx */
    /*  check if stack null, then already used */
        testq   %r10, %r10
        jz      1f
#if defined(WITH_THREAD_SANITIZER)
    /* Save non-callee-saved registers %rax, %rsi and %r10 before C call */
        ENTER_FUNCTION
        pushq   %rax; CFI_ADJUST(8);
        pushq   %rsi; CFI_ADJUST(8);
        pushq   %r10; CFI_ADJUST(8);
    /* Necessary to include the caller of caml_resume in the TSan backtrace */
        TSAN_ENTER_FUNCTION(24)
        popq    %r10; CFI_ADJUST(-8);
        popq    %rsi; CFI_ADJUST(-8);
        popq    %rax; CFI_ADJUST(-8);
        TSAN_SAVE_CALLER_REGS
    /* Signal to TSan all stack frames exited by the perform. */
        movq    Stack_sp(%r10), %r11
        movq    STACK_RETADDR(%r11), C_ARG_1  /* arg 1: pc of perform */
        leaq    STACK_ARG_1(%r11), C_ARG_2    /* arg 2: sp at perform */
        movq    %r10, C_ARG_3                 /* arg 3: fiber */
        SWITCH_OCAML_TO_C
        C_call  (GCALL(caml_tsan_entry_on_resume))
        SWITCH_C_TO_OCAML
        TSAN_RESTORE_CALLER_REGS
        LEAVE_FUNCTION
#endif
    /* Add current stack to the last fiber */
        movq    (Stack_handler-1)(%rsi), %rdi
        movq    Caml_state(current_stack), %rsi
        movq    %rsi, Handler_parent(%rdi)
     /* Need to update the oldest saved frame pointer here as the current fiber
        stack may have been reallocated or we may be resuming a computation
        that was not originally run here. */
        UPDATE_BASE_POINTER(%rdi)
        SWITCH_OCAML_STACKS
        jmp     *(%rbx)
1:      ENTER_FUNCTION
        TSAN_ENTER_FUNCTION(0) /* Necessary to include the caller of caml_resume
                                  in the TSan backtrace */
        LEA_VAR(caml_raise_continuation_already_resumed, %rax)
        jmp LBL(caml_c_call)
CFI_ENDPROC
ENDFUNCTION(G(caml_resume))

/* Run a function on a new stack,
   then invoke either the value or exception handler */
FUNCTION(G(caml_runstack))
CFI_STARTPROC
        CFI_SIGNAL_FRAME
        ENTER_FUNCTION
#ifdef WITH_THREAD_SANITIZER
     /* Save non-callee-saved registers %rax and %rdi before C call */
        pushq   %rax; CFI_ADJUST(8);
        pushq   %rdi; CFI_ADJUST(8);
     /* Necessary to include the caller of caml_runstack in TSan backtrace */
        TSAN_ENTER_FUNCTION(16)
        popq   %rdi; CFI_ADJUST(-8);
        popq   %rax; CFI_ADJUST(-8);
#endif
    /* %rax -> fiber, %rbx -> fun, %rdi -> arg */
        andq    $-2, %rax       /* %rax = Ptr_val(%rax) */
    /* save old stack pointer and exception handler */
        movq    Caml_state(current_stack), %rcx
        movq    Caml_state(exn_handler), %r10
        movq    %rsp, Stack_sp(%rcx)
        movq    %r10, Stack_exception(%rcx)
    /* Load new stack pointer and set parent */
        movq    Stack_handler(%rax), %r11
        movq    %rcx, Handler_parent(%r11)
        movq    %rax, Caml_state(current_stack)
        movq    Stack_sp(%rax), %r11
    /* Create an exception handler on the target stack
       after 16byte DWARF & gc_regs block (which is unused here) */
        subq    $32, %r11
        leaq    LBL(fiber_exn_handler)(%rip), %r10
        movq    %r10, 8(%r11)
    /* link the previous exn_handler so that copying stacks works */
        movq    Stack_exception(%rax), %r10
        movq    %r10, 0(%r11)
        movq    %r11, Caml_state(exn_handler)
    /* Switch to the new stack */
        movq    %r11, %rsp
#ifdef ASM_CFI_SUPPORTED
        CFI_REMEMBER_STATE
        .cfi_escape DW_CFA_def_cfa_expression, 3+3+2, \
          DW_OP_breg + DW_REG_rsp,                    \
            16 /* exn */ +                            \
            8 /* gc_regs slot (unused) */ +           \
            8 /* C_STACK_SP for DWARF (unused) */     \
            + Handler_parent, DW_OP_deref,            \
          DW_OP_plus_uconst, Stack_sp, DW_OP_deref,   \
          DW_OP_plus_uconst, RETADDR_ENTRY_SIZE
#endif
        movq    %rdi, %rax /* first argument */
        callq   *(%rbx) /* closure in %rbx (second argument) */
LBL(frame_runstack):
        leaq    32(%rsp), %r11 /* SP with exn handler popped */
        movq    Handler_value(%r11), %rbx
1:      movq    Caml_state(current_stack), C_ARG_1 /* arg to caml_free_stack */
    /* restore parent stack and exn_handler into Caml_state */
        movq    Handler_parent(%r11), %r10
        movq    Stack_exception(%r10), %r11
        movq    %r10, Caml_state(current_stack)
        movq    %r11, Caml_state(exn_handler)
    /* free old stack by switching directly to c_stack; is a no-alloc call */
        movq    Stack_sp(%r10), %r13 /* saved across C call */
        CFI_RESTORE_STATE
        CFI_REMEMBER_STATE
        CFI_DEF_CFA_REGISTER(DW_REG_r13)
        movq    %rax, %r12 /* save %rax across C call */
        movq    Caml_state(c_stack), %rsp
        C_call  (GCALL(caml_free_stack))
    /* switch directly to parent stack with correct return */
        movq    %r13, %rsp
        CFI_RESTORE_STATE
    /* signal to TSan that we exit caml_runstack (no registers to save here) */
        TSAN_EXIT_FUNCTION
        movq    %r12, %rax
    /* Invoke handle_value (or handle_exn) */
        LEAVE_FUNCTION
        jmp     *(%rbx)
LBL(fiber_exn_handler):
        leaq    16(%rsp), %r11
        movq    Handler_exception(%r11), %rbx
        jmp     1b
CFI_ENDPROC
ENDFUNCTION(G(caml_runstack))

FUNCTION(G(caml_ml_array_bound_error))
CFI_STARTPROC
        ENTER_FUNCTION
    /* No registers require saving before C call to TSan */
        TSAN_ENTER_FUNCTION(0)
        LEA_VAR(caml_array_bound_error_asm, %rax)
        jmp     LBL(caml_c_call)
CFI_ENDPROC
ENDFUNCTION(G(caml_ml_array_bound_error))

FUNCTION(G(caml_assert_stack_invariants))
CFI_STARTPROC
        movq    Caml_state(current_stack), %r11
        movq    %rsp, %r10
        subq    %r11, %r10      /* %r10: number of bytes left on stack */
        /* can be two words over: the return addresses */
        cmp     $((Stack_threshold_words + Stack_ctx_words - 2)*8), %r10
        jge     1f
        int3
1:      ret
CFI_ENDPROC
ENDFUNCTION(G(caml_assert_stack_invariants))

        TEXT_SECTION(caml_system__code_end)
        .globl  G(caml_system__code_end)
G(caml_system__code_end):

        .data
        .globl  G(caml_system.frametable)
        .align  EIGHT_ALIGN
G(caml_system.frametable):
        .quad   2           /* two descriptors */
        .quad   LBL(108)    /* return address into callback */
        .value  -1          /* negative frame size => use callback link */
        .value  0           /* no roots here */
        .align  EIGHT_ALIGN
        .quad   LBL(frame_runstack) /* return address into fiber_val_handler */
        .value  -1          /* negative frame size => use callback link */
        .value  0           /* no roots here */

#if defined(SYS_macosx)
        .literal16
#elif defined(SYS_mingw64) || defined(SYS_cygwin)
        .section .rdata,"dr"
#else
        .section    .rodata.cst16,"aM",@progbits,16
#endif
        .globl  G(caml_negf_mask)
        .align  SIXTEEN_ALIGN
G(caml_negf_mask):
        .quad   0x8000000000000000, 0
        .globl  G(caml_absf_mask)
        .align  SIXTEEN_ALIGN
G(caml_absf_mask):
        .quad   0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF

#if defined(SYS_linux) || defined(SYS_freebsd)
    /* Mark stack as non-executable, PR#4564 */
        .section .note.GNU-stack,"",%progbits
#endif
