/* sp.c
 *
 * Copyright (C) 2006-2023 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Implementation by Sean Parkinson. */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) || \
    defined(WOLFSSL_HAVE_SP_ECC)

#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#ifdef RSA_LOW_MEM
#ifndef SP_RSA_PRIVATE_EXP_D
#define SP_RSA_PRIVATE_EXP_D
#endif

#ifndef WOLFSSL_SP_SMALL
#define WOLFSSL_SP_SMALL
#endif
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
#undef WOLFSSL_SP_SMALL_STACK
#define WOLFSSL_SP_SMALL_STACK
#endif

#include <wolfssl/wolfcrypt/sp.h>

#ifdef __IAR_SYSTEMS_ICC__
#define __asm__        asm
#define __volatile__   volatile
#define WOLFSSL_NO_VAR_ASSIGN_REG
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__        __asm
#define __volatile__   volatile
#endif

#ifndef WOLFSSL_SP_ASM
#if SP_WORD_SIZE == 32
/* Debug helper: print a multi-digit number to stderr as "<name>=0x<hex>".
 *
 * The digits are copied into a local array so that var is not modified; the
 * copy is normalized with sp_<total>_norm_<words>() and serialized with
 * sp_<total>_to_bin_<words>() before the bytes are printed in hex.
 *
 * var    Array of sp_digit to print (words digits are copied from it).
 * name   String literal label; it is concatenated with the format string.
 * total  Tag pasted into the helper function names.
 * words  Number of digits; also pasted into the helper function names.
 * bits   Number of bits to print; (bits + 7) / 8 bytes are written.
 */
#define SP_PRINT_NUM(var, name, total, words, bits)   \
    do {                                              \
        int ii;                                       \
        byte nb[(bits + 7) / 8];                      \
        sp_digit _s[words];                           \
        XMEMCPY(_s, var, sizeof(_s));                 \
        sp_##total##_norm_##words(_s);                \
        sp_##total##_to_bin_##words(_s, nb);          \
        fprintf(stderr, name "=0x");                  \
        for (ii=0; ii<(bits + 7) / 8; ii++)           \
            fprintf(stderr, "%02x", nb[ii]);          \
        fprintf(stderr, "\n");                        \
    } while (0)

/* Debug helper: print a single value to stderr as "<name>=0x<value>".
 * The conversion specifier is taken from SP_PRINT_FMT, which is defined
 * outside this part of the file. name must be a string literal.
 */
#define SP_PRINT_VAL(var, name)                       \
    fprintf(stderr, name "=0x" SP_PRINT_FMT "\n", var)

/* Debug helper: print an int to stderr as "<name>=<value>" in decimal.
 * name must be a string literal.
 */
#define SP_PRINT_INT(var, name)                       \
    fprintf(stderr, name "=%d\n", var)

#if ((defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && \
     ((!defined(WC_NO_CACHE_RESISTANT) && \
       (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))) || \
      (defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP))) && \
    !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || (defined(WOLFSSL_SP_SMALL) && \
    defined(WOLFSSL_HAVE_SP_ECC) && (!defined(WOLFSSL_SP_NO_256) || \
    defined(WOLFSSL_SP_384) || defined(WOLFSSL_SP_521) || \
    defined(WOLFSSL_SP_1024)))
/* Masks used to obfuscate which of two addresses will be used: entry 0 has
 * no bits set and entry 1 has every bit set. */
static const size_t addr_mask[2] = { 0, (size_t)-1 };
#endif

#if defined(WOLFSSL_SP_NONBLOCK) && (!defined(WOLFSSL_SP_NO_MALLOC) || \
                                     !defined(WOLFSSL_SP_SMALL))
    #error SP non-blocking requires small and no-malloc (WOLFSSL_SP_SMALL and WOLFSSL_SP_NO_MALLOC)
#endif

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
#ifndef WOLFSSL_SP_NO_2048
/* Read big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of a (least significant first) and packed
 * into 29-bit digits. Conversion stops once size digits have been filled;
 * any digits of r that receive no data are set to zero.
 *
 * r     A single precision integer (29 bits of data per digit).
 * size  Maximum number of digits of r to fill.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int src;
    int dst = 0;
    word32 shift = 0;

    r[0] = 0;
    for (src = n - 1; src >= 0; src--) {
        const sp_digit octet = (sp_digit)a[src];

        r[dst] |= octet << shift;
        if (shift < 21U) {
            /* The whole byte fitted with room to spare in this digit. */
            shift += 8U;
            continue;
        }

        /* Digit is full: keep 29 bits and spill the rest of the byte. */
        r[dst] &= 0x1fffffff;
        shift = 29U - shift;
        if (dst + 1 >= size) {
            break;
        }
        r[++dst] = octet >> shift;
        shift = 8U - shift;
    }

    /* Clear every digit above the last one written. */
    dst++;
    while (dst < size) {
        r[dst++] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * At most size digits of r are written from a; digits of r that receive no
 * data are set to zero.
 *
 * r     A single precision integer (29 bits of data per digit).
 * size  Maximum number of digits of r to fill.
 * a     A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 29
    /* Same digit width: copy digit for digit without branching on a->used.
     * j starts at -used held in an unsigned word, so its top bit (bit 31)
     * is set exactly while digits of a remain to be copied. */
    int i;
    word32 j = (word32)0 - (word32)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* All ones while i < used, zero afterwards. */
        sp_digit mask = (sp_digit)0 - (sp_digit)(j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Only step the read index while another used digit follows, so
         * the index never moves past the last used digit. */
        o += (int)(j >> 31);
    }
#elif DIGIT_BIT > 29
    /* mp digits are wider than sp digits: each one is split over several
     * 29-bit digits of r. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Top up the partially filled digit; s bits of it are in use. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x1fffffff;
        /* s becomes the number of bits of a->dp[i] consumed so far. */
        s = 29U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further digits while a whole 29 bits remain in a->dp[i]. */
        while ((s + 29U) <= (word32)DIGIT_BIT) {
            s += 29U;
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        /* Bits of the current digit of r that now hold data. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Clear every digit above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp digits are narrower than sp digits: several of them are packed
     * into each 29-bit digit of r. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 29) {
            /* Digit of r is full: mask it and carry the rest over. */
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 29 - s;
            if (s == DIGIT_BIT) {
                /* The mp digit fitted exactly; start an empty digit. */
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Clear every digit above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 256
 *
 * r is normalized in place (carries pushed up through the digits) before
 * its digits are serialized, least significant digit first, starting at the
 * last byte of a and working towards a[0].
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_2048_to_bin_72(sp_digit* r, byte* a)
{
    int d;
    int pos;
    int shift = 0;
    int used;
    sp_digit carry;

    for (d = 0; d < 71; d++) {
        carry = r[d] >> 29;
        r[d] &= 0x1fffffff;
        r[d + 1] += carry;
    }

    pos = 2055 / 8 - 1;
    a[pos] = 0;
    for (d = 0; d < 71 && pos >= 0; d++) {
        /* Finish the byte left partly filled by the previous digit. */
        /* lint allow cast of mismatch sp_digit and int */
        a[pos--] |= (byte)(r[d] << shift); /*lint !e9033*/
        used = 8 - shift;
        if (pos < 0) {
            break;
        }
        /* Emit the remaining bits of this digit a byte at a time. */
        while (used < 29) {
            a[pos--] = (byte)(r[d] >> used);
            used += 8;
            if (pos < 0) {
                break;
            }
        }
        /* Bits of the last byte written that hold data from this digit. */
        shift = 8 - (used - 29);
        if (pos >= 0) {
            a[pos] = 0;
        }
        if (shift != 0) {
            /* Step back so the next digit is merged into that byte. */
            pos++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && (!defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(WOLFSSL_SP_SMALL))) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 29 bits.
 *
 * Whatever lies above the low 29 bits of each of the first 35 digits is
 * added to the next digit. The top digit, a[35], only receives a carry.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_36(sp_digit* a)
{
    sp_digit carry;
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 35) {
        carry = a[idx] >> 29;
        a[idx] &= 0x1fffffff;
        a[++idx] += carry;
    }
#else
    sp_digit* d = a;
    int blk;

    /* Digits 0..31 in four unrolled groups of eight. */
    for (blk = 0; blk < 4; blk++) {
        carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
        carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
        carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
        carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
        carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
        carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
        carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
        carry = d[7] >> 29; d[7] &= 0x1fffffff; d[8] += carry;
        d += 8;
    }
    /* Digits 32..34; d now points at a[32]. */
    carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
    carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
    carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
#endif /* WOLFSSL_SP_SMALL */
}

#endif /* (WOLFSSL_HAVE_SP_RSA && (!WOLFSSL_RSA_PUBLIC_ONLY || !WOLFSSL_SP_SMALL)) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 29 bits.
 *
 * Whatever lies above the low 29 bits of each of the first 71 digits is
 * added to the next digit. The top digit, a[71], only receives a carry.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_72(sp_digit* a)
{
    sp_digit carry;
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 71) {
        carry = a[idx] >> 29;
        a[idx] &= 0x1fffffff;
        a[++idx] += carry;
    }
#else
    sp_digit* d = a;
    int blk;

    /* Digits 0..63 in eight unrolled groups of eight. */
    for (blk = 0; blk < 8; blk++) {
        carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
        carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
        carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
        carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
        carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
        carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
        carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
        carry = d[7] >> 29; d[7] &= 0x1fffffff; d[8] += carry;
        d += 8;
    }
    /* Digits 64..70; d now points at a[64]. */
    carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
    carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
    carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
    carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
    carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
    carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
    carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
#endif /* WOLFSSL_SP_SMALL */
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled schoolbook multiplication of two 12-digit numbers. The
 * partial products of each result column are summed in an sp_uint64
 * accumulator; the low 29 bits become the result digit and the remainder is
 * carried into the next column. t0 and t1 take turns as the accumulator for
 * the current and the following column.
 *
 * The low 12 result digits are staged in t and only copied to r at the very
 * end, after every digit of a and b has been read.
 *
 * r  A single precision integer. Receives 24 digits.
 * a  A single precision integer. 12 digits are read.
 * b  A single precision integer. 12 digits are read.
 */
SP_NOINLINE static void sp_2048_mul_12(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_uint64 t0;
    sp_uint64 t1;
    sp_digit t[12];

    /* Columns 0..11: one more partial product per column. */
    t0 = ((sp_uint64)a[ 0]) * b[ 0];
    t1 = ((sp_uint64)a[ 0]) * b[ 1]
       + ((sp_uint64)a[ 1]) * b[ 0];
    t[ 0] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 0]) * b[ 2]
       + ((sp_uint64)a[ 1]) * b[ 1]
       + ((sp_uint64)a[ 2]) * b[ 0];
    t[ 1] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 0]) * b[ 3]
       + ((sp_uint64)a[ 1]) * b[ 2]
       + ((sp_uint64)a[ 2]) * b[ 1]
       + ((sp_uint64)a[ 3]) * b[ 0];
    t[ 2] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 0]) * b[ 4]
       + ((sp_uint64)a[ 1]) * b[ 3]
       + ((sp_uint64)a[ 2]) * b[ 2]
       + ((sp_uint64)a[ 3]) * b[ 1]
       + ((sp_uint64)a[ 4]) * b[ 0];
    t[ 3] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 0]) * b[ 5]
       + ((sp_uint64)a[ 1]) * b[ 4]
       + ((sp_uint64)a[ 2]) * b[ 3]
       + ((sp_uint64)a[ 3]) * b[ 2]
       + ((sp_uint64)a[ 4]) * b[ 1]
       + ((sp_uint64)a[ 5]) * b[ 0];
    t[ 4] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 0]) * b[ 6]
       + ((sp_uint64)a[ 1]) * b[ 5]
       + ((sp_uint64)a[ 2]) * b[ 4]
       + ((sp_uint64)a[ 3]) * b[ 3]
       + ((sp_uint64)a[ 4]) * b[ 2]
       + ((sp_uint64)a[ 5]) * b[ 1]
       + ((sp_uint64)a[ 6]) * b[ 0];
    t[ 5] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 0]) * b[ 7]
       + ((sp_uint64)a[ 1]) * b[ 6]
       + ((sp_uint64)a[ 2]) * b[ 5]
       + ((sp_uint64)a[ 3]) * b[ 4]
       + ((sp_uint64)a[ 4]) * b[ 3]
       + ((sp_uint64)a[ 5]) * b[ 2]
       + ((sp_uint64)a[ 6]) * b[ 1]
       + ((sp_uint64)a[ 7]) * b[ 0];
    t[ 6] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 0]) * b[ 8]
       + ((sp_uint64)a[ 1]) * b[ 7]
       + ((sp_uint64)a[ 2]) * b[ 6]
       + ((sp_uint64)a[ 3]) * b[ 5]
       + ((sp_uint64)a[ 4]) * b[ 4]
       + ((sp_uint64)a[ 5]) * b[ 3]
       + ((sp_uint64)a[ 6]) * b[ 2]
       + ((sp_uint64)a[ 7]) * b[ 1]
       + ((sp_uint64)a[ 8]) * b[ 0];
    t[ 7] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 0]) * b[ 9]
       + ((sp_uint64)a[ 1]) * b[ 8]
       + ((sp_uint64)a[ 2]) * b[ 7]
       + ((sp_uint64)a[ 3]) * b[ 6]
       + ((sp_uint64)a[ 4]) * b[ 5]
       + ((sp_uint64)a[ 5]) * b[ 4]
       + ((sp_uint64)a[ 6]) * b[ 3]
       + ((sp_uint64)a[ 7]) * b[ 2]
       + ((sp_uint64)a[ 8]) * b[ 1]
       + ((sp_uint64)a[ 9]) * b[ 0];
    t[ 8] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 0]) * b[10]
       + ((sp_uint64)a[ 1]) * b[ 9]
       + ((sp_uint64)a[ 2]) * b[ 8]
       + ((sp_uint64)a[ 3]) * b[ 7]
       + ((sp_uint64)a[ 4]) * b[ 6]
       + ((sp_uint64)a[ 5]) * b[ 5]
       + ((sp_uint64)a[ 6]) * b[ 4]
       + ((sp_uint64)a[ 7]) * b[ 3]
       + ((sp_uint64)a[ 8]) * b[ 2]
       + ((sp_uint64)a[ 9]) * b[ 1]
       + ((sp_uint64)a[10]) * b[ 0];
    t[ 9] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 0]) * b[11]
       + ((sp_uint64)a[ 1]) * b[10]
       + ((sp_uint64)a[ 2]) * b[ 9]
       + ((sp_uint64)a[ 3]) * b[ 8]
       + ((sp_uint64)a[ 4]) * b[ 7]
       + ((sp_uint64)a[ 5]) * b[ 6]
       + ((sp_uint64)a[ 6]) * b[ 5]
       + ((sp_uint64)a[ 7]) * b[ 4]
       + ((sp_uint64)a[ 8]) * b[ 3]
       + ((sp_uint64)a[ 9]) * b[ 2]
       + ((sp_uint64)a[10]) * b[ 1]
       + ((sp_uint64)a[11]) * b[ 0];
    t[10] = t0 & 0x1fffffff; t1 += t0 >> 29;
    /* Columns 12..22: one fewer partial product per column. From here the
     * result digits are stored straight into the upper half of r. */
    t0 = ((sp_uint64)a[ 1]) * b[11]
       + ((sp_uint64)a[ 2]) * b[10]
       + ((sp_uint64)a[ 3]) * b[ 9]
       + ((sp_uint64)a[ 4]) * b[ 8]
       + ((sp_uint64)a[ 5]) * b[ 7]
       + ((sp_uint64)a[ 6]) * b[ 6]
       + ((sp_uint64)a[ 7]) * b[ 5]
       + ((sp_uint64)a[ 8]) * b[ 4]
       + ((sp_uint64)a[ 9]) * b[ 3]
       + ((sp_uint64)a[10]) * b[ 2]
       + ((sp_uint64)a[11]) * b[ 1];
    t[11] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 2]) * b[11]
       + ((sp_uint64)a[ 3]) * b[10]
       + ((sp_uint64)a[ 4]) * b[ 9]
       + ((sp_uint64)a[ 5]) * b[ 8]
       + ((sp_uint64)a[ 6]) * b[ 7]
       + ((sp_uint64)a[ 7]) * b[ 6]
       + ((sp_uint64)a[ 8]) * b[ 5]
       + ((sp_uint64)a[ 9]) * b[ 4]
       + ((sp_uint64)a[10]) * b[ 3]
       + ((sp_uint64)a[11]) * b[ 2];
    r[12] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 3]) * b[11]
       + ((sp_uint64)a[ 4]) * b[10]
       + ((sp_uint64)a[ 5]) * b[ 9]
       + ((sp_uint64)a[ 6]) * b[ 8]
       + ((sp_uint64)a[ 7]) * b[ 7]
       + ((sp_uint64)a[ 8]) * b[ 6]
       + ((sp_uint64)a[ 9]) * b[ 5]
       + ((sp_uint64)a[10]) * b[ 4]
       + ((sp_uint64)a[11]) * b[ 3];
    r[13] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 4]) * b[11]
       + ((sp_uint64)a[ 5]) * b[10]
       + ((sp_uint64)a[ 6]) * b[ 9]
       + ((sp_uint64)a[ 7]) * b[ 8]
       + ((sp_uint64)a[ 8]) * b[ 7]
       + ((sp_uint64)a[ 9]) * b[ 6]
       + ((sp_uint64)a[10]) * b[ 5]
       + ((sp_uint64)a[11]) * b[ 4];
    r[14] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 5]) * b[11]
       + ((sp_uint64)a[ 6]) * b[10]
       + ((sp_uint64)a[ 7]) * b[ 9]
       + ((sp_uint64)a[ 8]) * b[ 8]
       + ((sp_uint64)a[ 9]) * b[ 7]
       + ((sp_uint64)a[10]) * b[ 6]
       + ((sp_uint64)a[11]) * b[ 5];
    r[15] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 6]) * b[11]
       + ((sp_uint64)a[ 7]) * b[10]
       + ((sp_uint64)a[ 8]) * b[ 9]
       + ((sp_uint64)a[ 9]) * b[ 8]
       + ((sp_uint64)a[10]) * b[ 7]
       + ((sp_uint64)a[11]) * b[ 6];
    r[16] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 7]) * b[11]
       + ((sp_uint64)a[ 8]) * b[10]
       + ((sp_uint64)a[ 9]) * b[ 9]
       + ((sp_uint64)a[10]) * b[ 8]
       + ((sp_uint64)a[11]) * b[ 7];
    r[17] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[ 8]) * b[11]
       + ((sp_uint64)a[ 9]) * b[10]
       + ((sp_uint64)a[10]) * b[ 9]
       + ((sp_uint64)a[11]) * b[ 8];
    r[18] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[ 9]) * b[11]
       + ((sp_uint64)a[10]) * b[10]
       + ((sp_uint64)a[11]) * b[ 9];
    r[19] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_uint64)a[10]) * b[11]
       + ((sp_uint64)a[11]) * b[10];
    r[20] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_uint64)a[11]) * b[11];
    r[21] = t1 & 0x1fffffff; t0 += t1 >> 29;
    r[22] = t0 & 0x1fffffff;
    /* Whatever is left above 29 bits of the last column is the top digit. */
    r[23] = (sp_digit)(t0 >> 29);
    /* Publish the staged low half now that all inputs have been read. */
    XMEMCPY(r, t, sizeof(t));
}

/* Add b to a into r. (r = a + b)
 *
 * The 12 digits are added pairwise; no carry is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_12(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off;

    /* Three groups of four digits, lowest digit first. */
    for (off = 0; off != 12; off += 4) {
        r[off + 0] = a[off + 0] + b[off + 0];
        r[off + 1] = a[off + 1] + b[off + 1];
        r[off + 2] = a[off + 2] + b[off + 2];
        r[off + 3] = a[off + 3] + b[off + 3];
    }

    return 0;
}

/* Sub b from a into r. (r = a - b)
 *
 * The 24 digits are subtracted pairwise; no borrow is moved between digits
 * here, so individual result digits may be negative.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_sub_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int groups;

    /* Three groups of eight digits, lowest digit first. */
    for (groups = 3; groups > 0; groups--) {
        r[0] = a[0] - b[0];
        r[1] = a[1] - b[1];
        r[2] = a[2] - b[2];
        r[3] = a[3] - b[3];
        r[4] = a[4] - b[4];
        r[5] = a[5] - b[5];
        r[6] = a[6] - b[6];
        r[7] = a[7] - b[7];
        r += 8;
        a += 8;
        b += 8;
    }

    return 0;
}

/* Add b to a into r. (r = a + b)
 *
 * The 24 digits are added pairwise; no carry is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off;

    /* Three groups of eight digits, lowest digit first. */
    for (off = 0; off != 24; off += 8) {
        sp_digit* rp = r + off;
        const sp_digit* ap = a + off;
        const sp_digit* bp = b + off;

        rp[0] = ap[0] + bp[0];
        rp[1] = ap[1] + bp[1];
        rp[2] = ap[2] + bp[2];
        rp[3] = ap[3] + bp[3];
        rp[4] = ap[4] + bp[4];
        rp[5] = ap[5] + bp[5];
        rp[6] = ap[6] + bp[6];
        rp[7] = ap[7] + bp[7];
    }

    return 0;
}

/* Normalize the values in each word to 29 bits.
 *
 * Whatever lies above the low 29 bits of each of the first 11 digits is
 * added to the next digit. The top digit, a[11], only receives a carry.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_12(sp_digit* a)
{
    sp_digit carry;
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 11) {
        carry = a[idx] >> 29;
        a[idx] &= 0x1fffffff;
        a[++idx] += carry;
    }
#else
    carry = a[0] >> 29; a[0] &= 0x1fffffff; a[1] += carry;
    carry = a[1] >> 29; a[1] &= 0x1fffffff; a[2] += carry;
    carry = a[2] >> 29; a[2] &= 0x1fffffff; a[3] += carry;
    carry = a[3] >> 29; a[3] &= 0x1fffffff; a[4] += carry;
    carry = a[4] >> 29; a[4] &= 0x1fffffff; a[5] += carry;
    carry = a[5] >> 29; a[5] &= 0x1fffffff; a[6] += carry;
    carry = a[6] >> 29; a[6] &= 0x1fffffff; a[7] += carry;
    carry = a[7] >> 29; a[7] &= 0x1fffffff; a[8] += carry;
    carry = a[8] >> 29; a[8] &= 0x1fffffff; a[9] += carry;
    carry = a[9] >> 29; a[9] &= 0x1fffffff; a[10] += carry;
    carry = a[10] >> 29; a[10] &= 0x1fffffff; a[11] += carry;
#endif /* WOLFSSL_SP_SMALL */
}

/* Normalize the values in each word to 29 bits.
 *
 * Whatever lies above the low 29 bits of each of the first 23 digits is
 * added to the next digit. The top digit, a[23], only receives a carry.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_24(sp_digit* a)
{
    sp_digit carry;
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 23) {
        carry = a[idx] >> 29;
        a[idx] &= 0x1fffffff;
        a[++idx] += carry;
    }
#else
    sp_digit* d = a;
    int blk;

    /* Digits 0..15 in two unrolled groups of eight. */
    for (blk = 0; blk < 2; blk++) {
        carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
        carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
        carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
        carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
        carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
        carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
        carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
        carry = d[7] >> 29; d[7] &= 0x1fffffff; d[8] += carry;
        d += 8;
    }
    /* Digits 16..22; d now points at a[16]. */
    carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
    carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
    carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
    carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
    carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
    carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
    carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a and b into r. (r = a * b)
 *
 * Each 36-digit operand is split into three 12-digit parts (low, middle,
 * high). Six 12x12 products are formed from the parts and from sums of the
 * parts, and the five 12-digit-aligned terms of the result are recovered
 * from them by subtraction:
 *   p0 = lo*lo', p2 = mid*mid', p4 = hi*hi',
 *   p1 = (lo+mid)*(lo'+mid'), p3 = (mid+hi)*(mid'+hi'),
 *   p5 = (lo+mid+hi)*(lo'+mid'+hi').
 *
 * All sums and products are computed into local buffers before r is
 * cleared, so a and b are fully read before r is first written.
 *
 * r  A single precision integer. Receives 72 digits.
 * a  A single precision integer. 36 digits are read.
 * b  A single precision integer. 36 digits are read.
 */
SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit p0[24];
    sp_digit p1[24];
    sp_digit p2[24];
    sp_digit p3[24];
    sp_digit p4[24];
    sp_digit p5[24];
    sp_digit t0[24];
    sp_digit t1[24];
    sp_digit t2[24];
    sp_digit a0[12];
    sp_digit a1[12];
    sp_digit a2[12];
    sp_digit b0[12];
    sp_digit b1[12];
    sp_digit b2[12];

    /* a0/b0 = low + middle part. */
    (void)sp_2048_add_12(a0, a, &a[12]);
    sp_2048_norm_12(a0);
    (void)sp_2048_add_12(b0, b, &b[12]);
    sp_2048_norm_12(b0);
    /* a1/b1 = middle + high part. */
    (void)sp_2048_add_12(a1, &a[12], &a[24]);
    sp_2048_norm_12(a1);
    (void)sp_2048_add_12(b1, &b[12], &b[24]);
    sp_2048_norm_12(b1);
    /* a2/b2 = low + middle + high part. Normalize the new sum a2 (not a1,
     * which is already normalized) so both factors of p5 are reduced. */
    (void)sp_2048_add_12(a2, a0, &a[24]);
    sp_2048_norm_12(a2);
    (void)sp_2048_add_12(b2, b0, &b[24]);
    sp_2048_norm_12(b2);

    sp_2048_mul_12(p0, a, b);
    sp_2048_mul_12(p2, &a[12], &b[12]);
    sp_2048_mul_12(p4, &a[24], &b[24]);
    sp_2048_mul_12(p1, a0, b0);
    sp_2048_mul_12(p3, a1, b1);
    sp_2048_mul_12(p5, a2, b2);

    /* Inputs are no longer needed; r may now be overwritten. */
    XMEMSET(r, 0, sizeof(*r)*2U*36U);
    /* t2 = p5 - (p3 - p2) - (p1 - p2): term at digit offset 24. */
    (void)sp_2048_sub_24(t0, p3, p2);
    (void)sp_2048_sub_24(t1, p1, p2);
    (void)sp_2048_sub_24(t2, p5, t0);
    (void)sp_2048_sub_24(t2, t2, t1);
    sp_2048_norm_24(t2);
    /* t0 = p3 - p2 - p4: term at digit offset 36. */
    (void)sp_2048_sub_24(t0, t0, p4);
    sp_2048_norm_24(t0);
    /* t1 = p1 - p2 - p0: term at digit offset 12. */
    (void)sp_2048_sub_24(t1, t1, p0);
    sp_2048_norm_24(t1);
    /* Accumulate the five overlapping terms and propagate carries. */
    (void)sp_2048_add_24(r, r, p0);
    (void)sp_2048_add_24(&r[12], &r[12], t1);
    (void)sp_2048_add_24(&r[24], &r[24], t2);
    (void)sp_2048_add_24(&r[36], &r[36], t0);
    (void)sp_2048_add_24(&r[48], &r[48], p4);
    sp_2048_norm_72(r);
}

/* Add b to a into r. (r = a + b)
 *
 * The 36 digits are added pairwise; no carry is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit* rp = r;
    const sp_digit* ap = a;
    const sp_digit* bp = b;
    int groups;

    /* Digits 0..31 in four groups of eight. */
    for (groups = 4; groups > 0; groups--) {
        rp[0] = ap[0] + bp[0];
        rp[1] = ap[1] + bp[1];
        rp[2] = ap[2] + bp[2];
        rp[3] = ap[3] + bp[3];
        rp[4] = ap[4] + bp[4];
        rp[5] = ap[5] + bp[5];
        rp[6] = ap[6] + bp[6];
        rp[7] = ap[7] + bp[7];
        rp += 8;
        ap += 8;
        bp += 8;
    }
    /* Digits 32..35. */
    rp[0] = ap[0] + bp[0];
    rp[1] = ap[1] + bp[1];
    rp[2] = ap[2] + bp[2];
    rp[3] = ap[3] + bp[3];

    return 0;
}

/* Add b to a into r. (r = a + b)
 *
 * The 72 digits are added pairwise; no carry is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off;

    /* Nine groups of eight digits, lowest digit first. */
    for (off = 0; off != 72; off += 8) {
        sp_digit* rp = r + off;
        const sp_digit* ap = a + off;
        const sp_digit* bp = b + off;

        rp[0] = ap[0] + bp[0];
        rp[1] = ap[1] + bp[1];
        rp[2] = ap[2] + bp[2];
        rp[3] = ap[3] + bp[3];
        rp[4] = ap[4] + bp[4];
        rp[5] = ap[5] + bp[5];
        rp[6] = ap[6] + bp[6];
        rp[7] = ap[7] + bp[7];
    }

    return 0;
}

/* Sub b from a into r. (r = a - b)
 *
 * The 72 digits are subtracted pairwise; no borrow is moved between digits
 * here, so individual result digits may be negative.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 *
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_sub_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int off;

    /* Nine groups of eight digits, lowest digit first. */
    for (off = 0; off != 72; off += 8) {
        sp_digit* rp = r + off;
        const sp_digit* ap = a + off;
        const sp_digit* bp = b + off;

        rp[0] = ap[0] - bp[0];
        rp[1] = ap[1] - bp[1];
        rp[2] = ap[2] - bp[2];
        rp[3] = ap[3] - bp[3];
        rp[4] = ap[4] - bp[4];
        rp[5] = ap[5] - bp[5];
        rp[6] = ap[6] - bp[6];
        rp[7] = ap[7] - bp[7];
    }

    return 0;
}

/* Normalize the values in each word to 29 bits.
 *
 * Whatever lies above the low 29 bits of each of the first 143 digits is
 * added to the next digit. The top digit, a[143], only receives a carry.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_2048_norm_144(sp_digit* a)
{
    sp_digit carry;
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    while (idx < 143) {
        carry = a[idx] >> 29;
        a[idx] &= 0x1fffffff;
        a[++idx] += carry;
    }
#else
    sp_digit* d = a;
    int blk;

    /* Digits 0..135 in seventeen unrolled groups of eight. */
    for (blk = 0; blk < 17; blk++) {
        carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
        carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
        carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
        carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
        carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
        carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
        carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
        carry = d[7] >> 29; d[7] &= 0x1fffffff; d[8] += carry;
        d += 8;
    }
    /* Digits 136..142; d now points at a[136]. */
    carry = d[0] >> 29; d[0] &= 0x1fffffff; d[1] += carry;
    carry = d[1] >> 29; d[1] &= 0x1fffffff; d[2] += carry;
    carry = d[2] >> 29; d[2] &= 0x1fffffff; d[3] += carry;
    carry = d[3] >> 29; d[3] &= 0x1fffffff; d[4] += carry;
    carry = d[4] >> 29; d[4] &= 0x1fffffff; d[5] += carry;
    carry = d[5] >> 29; d[5] &= 0x1fffffff; d[6] += carry;
    carry = d[6] >> 29; d[6] &= 0x1fffffff; d[7] += carry;
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a and b into r. (r = a * b)
 *
 * Karatsuba step over 36-digit halves: the low and high half products are
 * written directly into r, and the middle term
 * (aLo + aHi) * (bLo + bHi) - high - low is added in at digit offset 36.
 *
 * r  A single precision integer. Receives 144 digits.
 * a  A single precision integer. 72 digits are read.
 * b  A single precision integer. 72 digits are read.
 */
SP_NOINLINE static void sp_2048_mul_72(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit cross[72];
    sp_digit sum_b[36];
    /* The sum of a's halves lives in the buffer that later holds the
     * middle product. */
    sp_digit* sum_a = cross;
    sp_digit* low = r;
    sp_digit* high = r + 72;

    (void)sp_2048_add_36(sum_a, a, &a[36]);
    sp_2048_norm_36(sum_a);
    (void)sp_2048_add_36(sum_b, b, &b[36]);
    sp_2048_norm_36(sum_b);

    sp_2048_mul_36(high, &a[36], &b[36]);
    sp_2048_mul_36(low, a, b);
    sp_2048_mul_36(cross, sum_a, sum_b);

    /* cross = cross - high - low, then fold it into the middle of r. */
    (void)sp_2048_sub_72(cross, cross, high);
    (void)sp_2048_sub_72(cross, cross, low);
    (void)sp_2048_add_72(r + 36, r + 36, cross);
    sp_2048_norm_144(r);
}

/* Square a and put result in r. (r = a * a)
 *
 * Fully unrolled squaring of a 12-digit number. For each result column the
 * products of distinct digit pairs are summed once and doubled, and the
 * square of the middle digit is added for even columns. Sums are held in
 * sp_uint64 accumulators; the low 29 bits become the result digit and the
 * remainder is carried into the next column. t0 and t1 take turns as the
 * accumulator for the current and the following column.
 *
 * The low 12 result digits are staged in t and only copied to r at the very
 * end, after every digit of a has been read.
 *
 * r  A single precision integer. Receives 24 digits.
 * a  A single precision integer. 12 digits are read.
 */
SP_NOINLINE static void sp_2048_sqr_12(sp_digit* r, const sp_digit* a)
{
    sp_uint64 t0;
    sp_uint64 t1;
    sp_digit t[12];

    /* Columns 0..11. */
    t0 =  ((sp_uint64)a[ 0]) * a[ 0];
    t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2;
    t[ 0] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 0]) * a[ 2]) * 2
       +  ((sp_uint64)a[ 1]) * a[ 1];
    t[ 1] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 0]) * a[ 3]
       +  ((sp_uint64)a[ 1]) * a[ 2]) * 2;
    t[ 2] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 0]) * a[ 4]
       +  ((sp_uint64)a[ 1]) * a[ 3]) * 2
       +  ((sp_uint64)a[ 2]) * a[ 2];
    t[ 3] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 0]) * a[ 5]
       +  ((sp_uint64)a[ 1]) * a[ 4]
       +  ((sp_uint64)a[ 2]) * a[ 3]) * 2;
    t[ 4] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 0]) * a[ 6]
       +  ((sp_uint64)a[ 1]) * a[ 5]
       +  ((sp_uint64)a[ 2]) * a[ 4]) * 2
       +  ((sp_uint64)a[ 3]) * a[ 3];
    t[ 5] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 0]) * a[ 7]
       +  ((sp_uint64)a[ 1]) * a[ 6]
       +  ((sp_uint64)a[ 2]) * a[ 5]
       +  ((sp_uint64)a[ 3]) * a[ 4]) * 2;
    t[ 6] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 0]) * a[ 8]
       +  ((sp_uint64)a[ 1]) * a[ 7]
       +  ((sp_uint64)a[ 2]) * a[ 6]
       +  ((sp_uint64)a[ 3]) * a[ 5]) * 2
       +  ((sp_uint64)a[ 4]) * a[ 4];
    t[ 7] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 0]) * a[ 9]
       +  ((sp_uint64)a[ 1]) * a[ 8]
       +  ((sp_uint64)a[ 2]) * a[ 7]
       +  ((sp_uint64)a[ 3]) * a[ 6]
       +  ((sp_uint64)a[ 4]) * a[ 5]) * 2;
    t[ 8] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 0]) * a[10]
       +  ((sp_uint64)a[ 1]) * a[ 9]
       +  ((sp_uint64)a[ 2]) * a[ 8]
       +  ((sp_uint64)a[ 3]) * a[ 7]
       +  ((sp_uint64)a[ 4]) * a[ 6]) * 2
       +  ((sp_uint64)a[ 5]) * a[ 5];
    t[ 9] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 0]) * a[11]
       +  ((sp_uint64)a[ 1]) * a[10]
       +  ((sp_uint64)a[ 2]) * a[ 9]
       +  ((sp_uint64)a[ 3]) * a[ 8]
       +  ((sp_uint64)a[ 4]) * a[ 7]
       +  ((sp_uint64)a[ 5]) * a[ 6]) * 2;
    t[10] = t0 & 0x1fffffff; t1 += t0 >> 29;
    /* Columns 12..22. From here the result digits are stored straight into
     * the upper half of r. */
    t0 = (((sp_uint64)a[ 1]) * a[11]
       +  ((sp_uint64)a[ 2]) * a[10]
       +  ((sp_uint64)a[ 3]) * a[ 9]
       +  ((sp_uint64)a[ 4]) * a[ 8]
       +  ((sp_uint64)a[ 5]) * a[ 7]) * 2
       +  ((sp_uint64)a[ 6]) * a[ 6];
    t[11] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 2]) * a[11]
       +  ((sp_uint64)a[ 3]) * a[10]
       +  ((sp_uint64)a[ 4]) * a[ 9]
       +  ((sp_uint64)a[ 5]) * a[ 8]
       +  ((sp_uint64)a[ 6]) * a[ 7]) * 2;
    r[12] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 3]) * a[11]
       +  ((sp_uint64)a[ 4]) * a[10]
       +  ((sp_uint64)a[ 5]) * a[ 9]
       +  ((sp_uint64)a[ 6]) * a[ 8]) * 2
       +  ((sp_uint64)a[ 7]) * a[ 7];
    r[13] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 4]) * a[11]
       +  ((sp_uint64)a[ 5]) * a[10]
       +  ((sp_uint64)a[ 6]) * a[ 9]
       +  ((sp_uint64)a[ 7]) * a[ 8]) * 2;
    r[14] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 5]) * a[11]
       +  ((sp_uint64)a[ 6]) * a[10]
       +  ((sp_uint64)a[ 7]) * a[ 9]) * 2
       +  ((sp_uint64)a[ 8]) * a[ 8];
    r[15] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 6]) * a[11]
       +  ((sp_uint64)a[ 7]) * a[10]
       +  ((sp_uint64)a[ 8]) * a[ 9]) * 2;
    r[16] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 7]) * a[11]
       +  ((sp_uint64)a[ 8]) * a[10]) * 2
       +  ((sp_uint64)a[ 9]) * a[ 9];
    r[17] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[ 8]) * a[11]
       +  ((sp_uint64)a[ 9]) * a[10]) * 2;
    r[18] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_uint64)a[ 9]) * a[11]) * 2
       +  ((sp_uint64)a[10]) * a[10];
    r[19] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_uint64)a[10]) * a[11]) * 2;
    r[20] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 =  ((sp_uint64)a[11]) * a[11];
    r[21] = t1 & 0x1fffffff; t0 += t1 >> 29;
    r[22] = t0 & 0x1fffffff;
    /* Whatever is left above 29 bits of the last column is the top digit. */
    r[23] = (sp_digit)(t0 >> 29);
    /* Publish the staged low half now that all of a has been read. */
    XMEMCPY(r, t, sizeof(t));
}

/* Square a into r. (r = a * a)
 *
 * The 36-digit operand is split into three 12-digit parts (low, middle,
 * high). Six 12-digit squares - of the parts and of sums of the parts - are
 * combined by subtraction into the five overlapping terms of the result.
 * Every square is computed into a local buffer before r is cleared, so a is
 * fully read before r is first written.
 *
 * r  A single precision integer. Receives 72 digits.
 * a  A single precision integer. 36 digits are read.
 */
SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
{
    const sp_digit* lo = a;
    const sp_digit* mid = &a[12];
    const sp_digit* hi = &a[24];
    sp_digit sum_lm[12];     /* lo + mid */
    sp_digit sum_mh[12];     /* mid + hi */
    sp_digit sum_all[12];    /* lo + mid + hi */
    sp_digit sq_lo[24];
    sp_digit sq_mid[24];
    sp_digit sq_hi[24];
    sp_digit sq_lm[24];
    sp_digit sq_mh[24];
    sp_digit sq_all[24];
    sp_digit term1[24];      /* added at digit offset 12 */
    sp_digit term2[24];      /* added at digit offset 24 */
    sp_digit term3[24];      /* added at digit offset 36 */

    (void)sp_2048_add_12(sum_lm, lo, mid);
    sp_2048_norm_12(sum_lm);
    (void)sp_2048_add_12(sum_mh, mid, hi);
    sp_2048_norm_12(sum_mh);
    (void)sp_2048_add_12(sum_all, sum_lm, hi);
    sp_2048_norm_12(sum_all);

    sp_2048_sqr_12(sq_lo, lo);
    sp_2048_sqr_12(sq_mid, mid);
    sp_2048_sqr_12(sq_hi, hi);
    sp_2048_sqr_12(sq_lm, sum_lm);
    sp_2048_sqr_12(sq_mh, sum_mh);
    sp_2048_sqr_12(sq_all, sum_all);

    /* The input is no longer needed; r may now be overwritten. */
    XMEMSET(r, 0, sizeof(*r)*2U*36U);

    /* term2 = sq_all - (sq_mh - sq_mid) - (sq_lm - sq_mid) */
    (void)sp_2048_sub_24(term3, sq_mh, sq_mid);
    (void)sp_2048_sub_24(term1, sq_lm, sq_mid);
    (void)sp_2048_sub_24(term2, sq_all, term3);
    (void)sp_2048_sub_24(term2, term2, term1);
    sp_2048_norm_24(term2);
    /* term3 = sq_mh - sq_mid - sq_hi */
    (void)sp_2048_sub_24(term3, term3, sq_hi);
    sp_2048_norm_24(term3);
    /* term1 = sq_lm - sq_mid - sq_lo */
    (void)sp_2048_sub_24(term1, term1, sq_lo);
    sp_2048_norm_24(term1);

    /* Accumulate the five overlapping terms and propagate carries. */
    (void)sp_2048_add_24(r, r, sq_lo);
    (void)sp_2048_add_24(&r[12], &r[12], term1);
    (void)sp_2048_add_24(&r[24], &r[24], term2);
    (void)sp_2048_add_24(&r[36], &r[36], term3);
    (void)sp_2048_add_24(&r[48], &r[48], sq_hi);
    sp_2048_norm_72(r);
}

/* Square a and put result in r. (r = a * a)
 *
 * Karatsuba step over 36-digit halves: the squares of the low and high
 * halves are written directly into r, and the middle term
 * (aLo + aHi)^2 - high - low is added in at digit offset 36.
 *
 * r  A single precision integer. Receives 144 digits.
 * a  A single precision integer. 72 digits are read.
 */
SP_NOINLINE static void sp_2048_sqr_72(sp_digit* r, const sp_digit* a)
{
    sp_digit cross[72];
    /* The sum of a's halves lives in the buffer that later holds the
     * middle square. */
    sp_digit* sum = cross;
    sp_digit* low = r;
    sp_digit* high = r + 72;

    (void)sp_2048_add_36(sum, a, &a[36]);
    sp_2048_norm_36(sum);

    sp_2048_sqr_36(high, &a[36]);
    sp_2048_sqr_36(low, a);
    sp_2048_sqr_36(cross, sum);

    /* cross = cross - high - low, then fold it into the middle of r. */
    (void)sp_2048_sub_72(cross, cross, high);
    (void)sp_2048_sub_72(cross, cross, low);
    (void)sp_2048_add_72(r + 36, r + 36, cross);
    sp_2048_norm_144(r);
}

#endif /* !WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 72-digit numbers. (r = a + b)
 *
 * Each digit is summed independently; no carry is moved between digits.
 *
 * r  A single precision integer receiving the sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 72) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 72-digit numbers. (r = a - b)
 *
 * Each digit is subtracted independently; no borrow is moved between digits.
 *
 * r  A single precision integer receiving the differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_sub_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 72) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}


#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Result columns are computed from the most significant one downwards. The
 * carry out of column k is folded into column k + 1 before that digit is
 * stored. Each store goes to a digit index of r that is higher than any
 * digit index of a or b read afterwards.
 *
 * r  A single precision integer; 144 digits are written.
 * a  A single precision integer of 72 digits.
 * b  A single precision integer of 72 digits.
 */
SP_NOINLINE static void sp_2048_mul_72(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 lo;

    /* Column 142 consists of a single product. */
    c = ((sp_uint64)a[71]) * b[71];
    r[143] = (sp_digit)(c >> 29);
    /* c keeps the low 29 bits of the column above the one being summed. */
    c &= 0x1fffffff;
    for (k = 141; k >= 0; k--) {
        /* Column k is the sum of a[i] * b[k - i] for i in [i, imax]. */
        if (k >= 72) {
            i = k - 71;
            imax = 71;
        }
        else {
            i = 0;
            imax = k;
        }
        if (imax - i > 15) {
            /* More than 16 products: accumulate in groups of 15, moving the
             * high part of lo into c after each group. Presumably this keeps
             * the 64-bit accumulator from overflowing with 29-bit digits. */
            int imaxlo;
            lo = 0;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 15) {
                for (; i <= imax && i < imaxlo + 15; i++) {
                    lo += ((sp_uint64)a[i]) * b[k - i];
                }
                c += lo >> 29;
                lo &= 0x1fffffff;
            }
            /* c is now the finished value of column k + 1. */
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
        else {
            /* At most 16 products: sum them in one pass. */
            lo = 0;
            for (; i <= imax; i++) {
                lo += ((sp_uint64)a[i]) * b[k - i];
            }
            c += lo >> 29;
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
    }
    /* Low 29 bits of column 0. */
    r[0] = (sp_digit)c;
}

/* Square a and put result in r. (r = a * a)
 *
 * Result columns are computed from the most significant one downwards. For
 * each column the cross products a[i] * a[k - i] with i > k - i are summed
 * once and doubled; even columns also add the square of the middle digit.
 * Each store goes to a digit index of r that is higher than any digit index
 * of a read afterwards.
 *
 * r  A single precision integer; 144 digits are written.
 * a  A single precision integer of 72 digits.
 */
SP_NOINLINE static void sp_2048_sqr_72(sp_digit* r, const sp_digit* a)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 t;

    /* Column 142 consists of a single square. */
    c = ((sp_uint64)a[71]) * a[71];
    r[143] = (sp_digit)(c >> 29);
    /* Between iterations c holds the pending digit shifted up by 29 bits, so
     * the next (lower) column can be added underneath it. */
    c = (c & 0x1fffffff) << 29;
    for (k = 141; k >= 0; k--) {
        i = (k + 1) / 2;
        if ((k & 1) == 0) {
           /* Even column: the middle term a[k/2]^2 is added once. */
           c += ((sp_uint64)a[i]) * a[i];
           i++;
        }
        if (k < 71) {
            imax = k;
        }
        else {
            imax = 71;
        }
        if (imax - i >= 14) {
            /* 15 or more cross products: process in groups of 14, moving the
             * overflow above 29 bits into hi after each group. */
            int imaxlo;
            sp_uint64 hi;

            hi = c >> 29;
            c &= 0x1fffffff;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 14) {
                t = 0;
                for (; i <= imax && i < imaxlo + 14; i++) {
                    t += ((sp_uint64)a[i]) * a[k - i];
                }
                /* Each cross product occurs twice in the square. */
                c += t * 2;

                hi += c >> 29;
                c &= 0x1fffffff;
            }
            /* hi is now the finished value of column k + 1. */
            r[k + 2] += (sp_digit)(hi >> 29);
            r[k + 1]  = (sp_digit)(hi & 0x1fffffff);
            c <<= 29;
        }
        else
        {
            t = 0;
            for (; i <= imax; i++) {
                t += ((sp_uint64)a[i]) * a[k - i];
            }
            /* Each cross product occurs twice in the square. */
            c += t * 2;

            /* Bits 58 and up carry into digit k + 2; bits 29..57 are digit
             * k + 1; the low 29 bits remain pending for the next column. */
            r[k + 2] += (sp_digit) (c >> 58);
            r[k + 1]  = (sp_digit)((c >> 29) & 0x1fffffff);
            c = (c & 0x1fffffff) << 29;
        }
    }
    r[0] = (sp_digit)(c >> 29);
}

#endif /* WOLFSSL_SP_SMALL */
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 36-digit numbers. (r = a + b)
 *
 * Each digit is summed independently; no carry is moved between digits.
 *
 * r  A single precision integer receiving the sums.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 36) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 36-digit numbers. (r = a - b)
 *
 * Each digit is subtracted independently; no borrow is moved between digits.
 *
 * r  A single precision integer receiving the differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 36) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}


#else
/* Digit-wise subtraction of two 36-digit numbers, unrolled. (r = a - b)
 *
 * Four groups of eight digits followed by the remaining four. Each digit is
 * subtracted independently; no borrow is moved between digits.
 *
 * r  A single precision integer receiving the differences.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_2048_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base;

    for (base = 0; base < 32; base += 8) {
        sp_digit* out = r + base;
        const sp_digit* lhs = a + base;
        const sp_digit* rhs = b + base;

        out[0] = lhs[0] - rhs[0];
        out[1] = lhs[1] - rhs[1];
        out[2] = lhs[2] - rhs[2];
        out[3] = lhs[3] - rhs[3];
        out[4] = lhs[4] - rhs[4];
        out[5] = lhs[5] - rhs[5];
        out[6] = lhs[6] - rhs[6];
        out[7] = lhs[7] - rhs[7];
    }
    /* Tail: digits 32..35. */
    r[32] = a[32] - b[32];
    r[33] = a[33] - b[33];
    r[34] = a[34] - b[34];
    r[35] = a[35] - b[35];

    return 0;
}


#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Result columns are computed from the most significant one downwards. The
 * carry out of column k is folded into column k + 1 before that digit is
 * stored. Each store goes to a digit index of r that is higher than any
 * digit index of a or b read afterwards; callers in this file pass the same
 * buffer for r and a.
 *
 * r  A single precision integer; 72 digits are written.
 * a  A single precision integer of 36 digits.
 * b  A single precision integer of 36 digits.
 */
SP_NOINLINE static void sp_2048_mul_36(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 lo;

    /* Column 70 consists of a single product. */
    c = ((sp_uint64)a[35]) * b[35];
    r[71] = (sp_digit)(c >> 29);
    /* c keeps the low 29 bits of the column above the one being summed. */
    c &= 0x1fffffff;
    for (k = 69; k >= 0; k--) {
        /* Column k is the sum of a[i] * b[k - i] for i in [i, imax]. */
        if (k >= 36) {
            i = k - 35;
            imax = 35;
        }
        else {
            i = 0;
            imax = k;
        }
        if (imax - i > 15) {
            /* More than 16 products: accumulate in groups of 15, moving the
             * high part of lo into c after each group. Presumably this keeps
             * the 64-bit accumulator from overflowing with 29-bit digits. */
            int imaxlo;
            lo = 0;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 15) {
                for (; i <= imax && i < imaxlo + 15; i++) {
                    lo += ((sp_uint64)a[i]) * b[k - i];
                }
                c += lo >> 29;
                lo &= 0x1fffffff;
            }
            /* c is now the finished value of column k + 1. */
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
        else {
            /* At most 16 products: sum them in one pass. */
            lo = 0;
            for (; i <= imax; i++) {
                lo += ((sp_uint64)a[i]) * b[k - i];
            }
            c += lo >> 29;
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
    }
    /* Low 29 bits of column 0. */
    r[0] = (sp_digit)c;
}

/* Square a and put result in r. (r = a * a)
 *
 * Result columns are computed from the most significant one downwards. For
 * each column the cross products a[i] * a[k - i] with i > k - i are summed
 * once and doubled; even columns also add the square of the middle digit.
 * Each store goes to a digit index of r that is higher than any digit index
 * of a read afterwards; callers in this file pass the same buffer for r
 * and a.
 *
 * r  A single precision integer; 72 digits are written.
 * a  A single precision integer of 36 digits.
 */
SP_NOINLINE static void sp_2048_sqr_36(sp_digit* r, const sp_digit* a)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 t;

    /* Column 70 consists of a single square. */
    c = ((sp_uint64)a[35]) * a[35];
    r[71] = (sp_digit)(c >> 29);
    /* Between iterations c holds the pending digit shifted up by 29 bits, so
     * the next (lower) column can be added underneath it. */
    c = (c & 0x1fffffff) << 29;
    for (k = 69; k >= 0; k--) {
        i = (k + 1) / 2;
        if ((k & 1) == 0) {
           /* Even column: the middle term a[k/2]^2 is added once. */
           c += ((sp_uint64)a[i]) * a[i];
           i++;
        }
        if (k < 35) {
            imax = k;
        }
        else {
            imax = 35;
        }
        if (imax - i >= 14) {
            /* 15 or more cross products: process in groups of 14, moving the
             * overflow above 29 bits into hi after each group. */
            int imaxlo;
            sp_uint64 hi;

            hi = c >> 29;
            c &= 0x1fffffff;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 14) {
                t = 0;
                for (; i <= imax && i < imaxlo + 14; i++) {
                    t += ((sp_uint64)a[i]) * a[k - i];
                }
                /* Each cross product occurs twice in the square. */
                c += t * 2;

                hi += c >> 29;
                c &= 0x1fffffff;
            }
            /* hi is now the finished value of column k + 1. */
            r[k + 2] += (sp_digit)(hi >> 29);
            r[k + 1]  = (sp_digit)(hi & 0x1fffffff);
            c <<= 29;
        }
        else
        {
            t = 0;
            for (; i <= imax; i++) {
                t += ((sp_uint64)a[i]) * a[k - i];
            }
            /* Each cross product occurs twice in the square. */
            c += t * 2;

            /* Bits 58 and up carry into digit k + 2; bits 29..57 are digit
             * k + 1; the low 29 bits remain pending for the next column. */
            r[k + 2] += (sp_digit) (c >> 58);
            r[k + 1]  = (sp_digit)((c >> 29) & 0x1fffffff);
            c = (c & 0x1fffffff) << 29;
        }
    }
    r[0] = (sp_digit)(c >> 29);
}

#endif /* WOLFSSL_SP_SMALL */
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Compute the Montgomery constant for a modulus: the bottom 29-bit digit of
 * -1/a mod 2^29.
 *
 * Only a[0] is read. An inverse of a[0] that is correct to 4 bits is refined
 * by three doubling steps, masked to 29 bits and then negated modulo 2^29.
 *
 * a    A single precision number (the modulus).
 * rho  Receives the bottom digit of the negated inverse.
 */
static void sp_2048_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit low = a[0];
    sp_digit inv;
    int round;

    /* Seed: inv * low == 1 mod 2**4. */
    inv = (((low + 2) & 4) << 1) + low;
    /* Each round doubles the valid bits: 2**8, 2**16, then 2**32. */
    for (round = 0; round < 3; round++) {
        inv = inv * (2 - (low * inv));
    }
    inv &= 0x1fffffff;

    /* rho = -1/a mod 2**29 */
    *rho = ((sp_digit)1 << 29) - inv;
}

/* Multiply a 72-digit number by a single digit. (r = a * b)
 *
 * Products are accumulated in 64 bits; the low 29 bits of the accumulator
 * become the output digit and the rest carries into the next digit.
 *
 * r  A single precision integer; 73 digits are written.
 * a  A single precision integer of 72 digits.
 * b  A scalar.
 */
SP_NOINLINE static void sp_2048_mul_d_72(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 72) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Final carry is stored unmasked. */
    r[72] = (sp_digit)acc;
#else
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int base;

    for (base = 0; base < 72; base += 4) {
        /* All four products are formed before any digit is stored. */
        const sp_int64 prod0 = scalar * a[base + 0];
        const sp_int64 prod1 = scalar * a[base + 1];
        const sp_int64 prod2 = scalar * a[base + 2];
        const sp_int64 prod3 = scalar * a[base + 3];

        acc += prod0;
        r[base + 0] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod1;
        r[base + 1] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod2;
        r[base + 2] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod3;
        r[base + 3] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
    }
    /* Final carry is masked to 29 bits in this variant. */
    r[72] = (sp_digit)(acc & 0x1fffffff);
#endif /* WOLFSSL_SP_SMALL */
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * Here n = 35 * 29 + 9 = 1024: r is set to 2^n - 1, m is subtracted once
 * and one is added back. A single subtraction is only a full reduction when
 * m is n bits long; presumably callers guarantee that - confirm.
 *
 * r  A single precision number; 36 digits are written.
 * m  A single precision number (the modulus).
 */
static void sp_2048_mont_norm_36(sp_digit* r, const sp_digit* m)
{
    int idx;

    /* Set r = 2^n - 1: the low 35 digits are all ones. */
#ifdef WOLFSSL_SP_SMALL
    idx = 0;
    while (idx < 35) {
        r[idx] = 0x1fffffff;
        idx++;
    }
#else
    for (idx = 0; idx < 32; idx += 8) {
        sp_digit* blk = r + idx;

        blk[0] = 0x1fffffff;
        blk[1] = 0x1fffffff;
        blk[2] = 0x1fffffff;
        blk[3] = 0x1fffffff;
        blk[4] = 0x1fffffff;
        blk[5] = 0x1fffffff;
        blk[6] = 0x1fffffff;
        blk[7] = 0x1fffffff;
    }
    r[32] = 0x1fffffff;
    r[33] = 0x1fffffff;
    r[34] = 0x1fffffff;
#endif /* WOLFSSL_SP_SMALL */
    /* Top digit holds the remaining 9 bits. */
    r[35] = 0x1ffL;

    /* r = (2^n - 1) - m */
    (void)sp_2048_sub_36(r, r, m);

    /* Add one so r = 2^n - m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are visited from most significant (35) to least significant (0)
 * with no data-dependent branches. Each digit difference is OR-ed into the
 * running result under a mask derived from that running result, so that a
 * decision made by a more significant digit is not overturned by less
 * significant ones. The mask expression relies on right-shifting a signed
 * value - NOTE(review): assumes sp_digit is a signed 32-bit type with
 * arithmetic right shift; confirm against sp.h.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_36(const sp_digit* a, const sp_digit* b)
{
    sp_digit r = 0;
#ifdef WOLFSSL_SP_SMALL
    int i;

    for (i=35; i>=0; i--) {
        /* While r is 0 the mask is all ones and the difference is taken. */
        r |= (a[i] - b[i]) & ~(((sp_digit)0 - r) >> 28);
    }
#else
    int i;

    /* Top digit: r is still 0, so the mask is simply all ones. */
    r |= (a[35] - b[35]) & (0 - (sp_digit)1);
    r |= (a[34] - b[34]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[33] - b[33]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[32] - b[32]) & ~(((sp_digit)0 - r) >> 28);
    /* Remaining 32 digits, eight per iteration, still high to low. */
    for (i = 24; i >= 0; i -= 8) {
        r |= (a[i + 7] - b[i + 7]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 6] - b[i + 6]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 5] - b[i + 5]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 4] - b[i + 4]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 3] - b[i + 3]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 2] - b[i + 2]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 1] - b[i + 1]) & ~(((sp_digit)0 - r) >> 28);
        r |= (a[i + 0] - b[i + 0]) & ~(((sp_digit)0 - r) >> 28);
    }
#endif /* WOLFSSL_SP_SMALL */

    return r;
}

/* Branch-free conditional subtraction of 36-digit numbers.
 * Each digit of b is AND-ed with m before being subtracted, so m = -1
 * subtracts b and m = 0 copies a unchanged. No borrow is moved between
 * digits.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_2048_cond_sub_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx;

#ifdef WOLFSSL_SP_SMALL
    idx = 0;
    while (idx < 36) {
        r[idx] = a[idx] - (b[idx] & m);
        idx++;
    }
#else
    for (idx = 0; idx < 32; idx += 8) {
        sp_digit* out = r + idx;
        const sp_digit* lhs = a + idx;
        const sp_digit* rhs = b + idx;

        out[0] = lhs[0] - (rhs[0] & m);
        out[1] = lhs[1] - (rhs[1] & m);
        out[2] = lhs[2] - (rhs[2] & m);
        out[3] = lhs[3] - (rhs[3] & m);
        out[4] = lhs[4] - (rhs[4] & m);
        out[5] = lhs[5] - (rhs[5] & m);
        out[6] = lhs[6] - (rhs[6] & m);
        out[7] = lhs[7] - (rhs[7] & m);
    }
    /* Tail: digits 32..35. */
    r[32] = a[32] - (b[32] & m);
    r[33] = a[33] - (b[33] & m);
    r[34] = a[34] - (b[34] & m);
    r[35] = a[35] - (b[35] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Digits 0..35 of r are rewritten masked to 29 bits; the final carry is
 * added into r[36], so r must have at least 37 digits.
 *
 * Three variants are selected at compile time:
 *   - WOLFSSL_SP_LARGE_CODE not defined: a plain digit-by-digit loop;
 *   - WOLFSSL_SP_LARGE_CODE with WOLFSSL_SP_SMALL: unrolled four digits at
 *     a time;
 *   - WOLFSSL_SP_LARGE_CODE without WOLFSSL_SP_SMALL: unrolled eight digits
 *     at a time, with the last four digits handled separately.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_2048_mul_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 tb = b;
    sp_int64 t = 0;
    int i;

    for (i = 0; i < 36; i++) {
        /* t = carry in + existing digit + product. */
        t += r[i];
        t += tb * a[i];
        r[i] = ((sp_digit)t) & 0x1fffffff;
        t >>= 29;
    }
    r[36] += (sp_digit)t;
#else
#ifdef WOLFSSL_SP_SMALL
    sp_int64 tb = b;
    sp_int64 t[4];
    int i;

    /* t[0] carries from one group of four digits into the next. */
    t[0] = 0;
    for (i = 0; i < 32; i += 4) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[0]  = t[3] >> 29;
    }
    /* Last four digits (32..35); the carry goes into r[36]. */
    t[0] += (tb * a[32]) + r[32];
    t[1]  = (tb * a[33]) + r[33];
    t[2]  = (tb * a[34]) + r[34];
    t[3]  = (tb * a[35]) + r[35];
    r[32] = t[0] & 0x1fffffff;
    t[1] += t[0] >> 29;
    r[33] = t[1] & 0x1fffffff;
    t[2] += t[1] >> 29;
    r[34] = t[2] & 0x1fffffff;
    t[3] += t[2] >> 29;
    r[35] = t[3] & 0x1fffffff;
    r[36] +=  (sp_digit)(t[3] >> 29);
#else
    sp_int64 tb = b;
    sp_int64 t[8];
    int i;

    /* t[0] carries from one group of eight digits into the next. */
    t[0] = 0;
    for (i = 0; i < 32; i += 8) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        t[4]  = (tb * a[i+4]) + r[i+4];
        t[5]  = (tb * a[i+5]) + r[i+5];
        t[6]  = (tb * a[i+6]) + r[i+6];
        t[7]  = (tb * a[i+7]) + r[i+7];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[4] += t[3] >> 29;
        r[i+4] = t[4] & 0x1fffffff;
        t[5] += t[4] >> 29;
        r[i+5] = t[5] & 0x1fffffff;
        t[6] += t[5] >> 29;
        r[i+6] = t[6] & 0x1fffffff;
        t[7] += t[6] >> 29;
        r[i+7] = t[7] & 0x1fffffff;
        t[0]  = t[7] >> 29;
    }
    /* Last four digits (32..35); the carry goes into r[36]. */
    t[0] += (tb * a[32]) + r[32];
    t[1]  = (tb * a[33]) + r[33];
    t[2]  = (tb * a[34]) + r[34];
    t[3]  = (tb * a[35]) + r[35];
    r[32] = t[0] & 0x1fffffff;
    t[1] += t[0] >> 29;
    r[33] = t[1] & 0x1fffffff;
    t[2] += t[1] >> 29;
    r[34] = t[2] & 0x1fffffff;
    t[3] += t[2] >> 29;
    r[35] = t[3] & 0x1fffffff;
    r[36] +=  (sp_digit)(t[3] >> 29);
#endif /* WOLFSSL_SP_SMALL */
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Shift the result in the high 1024 bits down to the bottom.
 *
 * This is a right shift by 35 digits plus 9 bits (35 * 29 + 9 = 1024).
 * Output digit i is assembled from input digits at index 35 + i and above,
 * so each read is at a higher index than any earlier write; the caller in
 * this file passes the same buffer for r and a. Digits 36..71 of r are
 * cleared afterwards.
 *
 * r  A single precision number; 72 digits are written.
 * a  A single precision number of 72 digits.
 */
static void sp_2048_mont_shift_36(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    /* Drop the low 9 bits of digit 35, leaving its top 20 bits. */
    sp_int64 n = a[35] >> 9;
    /* Place the next digit directly above those 20 bits. */
    n += ((sp_int64)a[36]) << 20;

    for (i = 0; i < 35; i++) {
        r[i] = n & 0x1fffffff;
        n >>= 29;
        n += ((sp_int64)a[37 + i]) << 20;
    }
    r[35] = (sp_digit)n;
#else
    int i;
    /* Drop the low 9 bits of digit 35, leaving its top 20 bits. */
    sp_int64 n = a[35] >> 9;
    /* Place the next digit directly above those 20 bits. */
    n += ((sp_int64)a[36]) << 20;
    for (i = 0; i < 32; i += 8) {
        r[i + 0] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 37]) << 20;
        r[i + 1] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 38]) << 20;
        r[i + 2] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 39]) << 20;
        r[i + 3] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 40]) << 20;
        r[i + 4] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 41]) << 20;
        r[i + 5] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 42]) << 20;
        r[i + 6] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 43]) << 20;
        r[i + 7] = n & 0x1fffffff;
        n >>= 29; n += ((sp_int64)a[i + 44]) << 20;
    }
    /* Tail: output digits 32..35 from input digits 69..71. */
    r[32] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[69]) << 20;
    r[33] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[70]) << 20;
    r[34] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[71]) << 20;
    r[35] = (sp_digit)n;
#endif /* WOLFSSL_SP_SMALL */
    /* Clear the upper half that has been shifted out. */
    XMEMSET(&r[36], 0, sizeof(*r) * 36U);
}

/* Reduce a 72-digit number back to 36 digits using Montgomery reduction.
 *
 * The reduction is by 35 full 29-bit digits plus 9 bits of digit 35, i.e.
 * 1024 bits (half of 2048). For each of those digits a multiple of m is
 * added that is chosen from the digit and mp; the result is then shifted
 * down and m is conditionally subtracted once.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_36(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper half before carries are added into it. */
    sp_2048_norm_36(a + 36);

    for (i=0; i<35; i++) {
        /* The multiply is done in unsigned 32-bit and masked to a digit. */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
        sp_2048_mul_add_36(a+i, m, mu);
        /* Push what is left above 29 bits of digit i into digit i + 1. */
        a[i+1] += a[i] >> 29;
    }
    /* Digit 35: only its low 9 bits belong to the part being reduced. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1ffL;
    sp_2048_mul_add_36(a+i, m, mu);
    a[i+1] += a[i] >> 29;
    a[i] &= 0x1fffffff;
    /* In-place shift down by 1024 bits. */
    sp_2048_mont_shift_36(a, a);
    /* Subtract m when the top digit of a exceeds the top digit of m: the
     * mask is all ones when over >= 1 and zero otherwise (relies on an
     * arithmetic right shift of a signed 32-bit value). */
    over = a[35] - m[35];
    sp_2048_cond_sub_36(a, a, m, ~((over - 1) >> 31));
    sp_2048_norm_36(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full 72-digit product is written to r and then Montgomery reduced in
 * place, so r must have room for 72 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_mul_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* r = a * b (72 digits). */
    sp_2048_mul_36(r, a, b);
    /* Reduce r in place. */
    sp_2048_mont_reduce_36(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full 72-digit square is written to r and then Montgomery reduced in
 * place, so r must have room for 72 digits. A caller in this file passes
 * the same buffer for r and a.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_sqr_36(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* r = a * a (72 digits). */
    sp_2048_sqr_36(r, a);
    /* Reduce r in place. */
    sp_2048_mont_reduce_36(r, m, mp);
}

/* Multiply a 36-digit number by a single digit. (r = a * b)
 *
 * Products are accumulated in 64 bits; the low 29 bits of the accumulator
 * become the output digit and the rest carries into the next digit.
 *
 * r  A single precision integer; 37 digits are written.
 * a  A single precision integer of 36 digits.
 * b  A scalar.
 */
SP_NOINLINE static void sp_2048_mul_d_36(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 36) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Final carry is stored unmasked. */
    r[36] = (sp_digit)acc;
#else
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int base;

    for (base = 0; base < 36; base += 4) {
        /* All four products are formed before any digit is stored. */
        const sp_int64 prod0 = scalar * a[base + 0];
        const sp_int64 prod1 = scalar * a[base + 1];
        const sp_int64 prod2 = scalar * a[base + 2];
        const sp_int64 prod3 = scalar * a[base + 3];

        acc += prod0;
        r[base + 0] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod1;
        r[base + 1] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod2;
        r[base + 2] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += prod3;
        r[base + 3] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
    }
    /* Final carry is masked to 29 bits in this variant. */
    r[36] = (sp_digit)(acc & 0x1fffffff);
#endif /* WOLFSSL_SP_SMALL */
}

#ifdef WOLFSSL_SP_SMALL
/* Branch-free conditional addition of 36-digit numbers.
 * Each digit of b is AND-ed with m before being added, so m = -1 adds b and
 * m = 0 copies a unchanged. No carry is moved between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 36) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

#ifndef WOLFSSL_SP_SMALL
/* Branch-free conditional addition of 36-digit numbers, unrolled.
 * Each digit of b is AND-ed with m before being added, so m = -1 adds b and
 * m = 0 copies a unchanged. No carry is moved between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_2048_cond_add_36(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int base;

    for (base = 0; base < 32; base += 8) {
        sp_digit* out = r + base;
        const sp_digit* lhs = a + base;
        const sp_digit* rhs = b + base;

        out[0] = lhs[0] + (rhs[0] & m);
        out[1] = lhs[1] + (rhs[1] & m);
        out[2] = lhs[2] + (rhs[2] & m);
        out[3] = lhs[3] + (rhs[3] & m);
        out[4] = lhs[4] + (rhs[4] & m);
        out[5] = lhs[5] + (rhs[5] & m);
        out[6] = lhs[6] + (rhs[6] & m);
        out[7] = lhs[7] + (rhs[7] & m);
    }
    /* Tail: digits 32..35. */
    r[32] = a[32] + (b[32] & m);
    r[33] = a[33] + (b[33] & m);
    r[34] = a[34] + (b[34] & m);
    r[35] = a[35] + (b[35] & m);
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift a 36-digit number right by n bits. (r = a >> n)
 *
 * Output digit i combines the upper bits of input digit i with the low n
 * bits of input digit i + 1. Digits are processed from low to high, so each
 * input digit is read before its position is written; the caller in this
 * file passes the same buffer for r and a.
 *
 * r  A single precision integer receiving the shifted value.
 * a  A single precision integer to shift.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_2048_rshift_36(sp_digit* r, const sp_digit* a,
        byte n)
{
    /* Left shift that lines the next digit's low bits up above this one. */
    const int lift = 29 - n;
    int idx;

#ifdef WOLFSSL_SP_SMALL
    idx = 0;
    while (idx < 35) {
        r[idx] = ((a[idx] >> n) | (a[idx + 1] << lift)) & 0x1fffffff;
        idx++;
    }
#else
    for (idx = 0; idx < 32; idx += 8) {
        sp_digit* out = r + idx;
        const sp_digit* in = a + idx;

        out[0] = (in[0] >> n) | ((in[1] << lift) & 0x1fffffff);
        out[1] = (in[1] >> n) | ((in[2] << lift) & 0x1fffffff);
        out[2] = (in[2] >> n) | ((in[3] << lift) & 0x1fffffff);
        out[3] = (in[3] >> n) | ((in[4] << lift) & 0x1fffffff);
        out[4] = (in[4] >> n) | ((in[5] << lift) & 0x1fffffff);
        out[5] = (in[5] >> n) | ((in[6] << lift) & 0x1fffffff);
        out[6] = (in[6] >> n) | ((in[7] << lift) & 0x1fffffff);
        out[7] = (in[7] >> n) | ((in[8] << lift) & 0x1fffffff);
    }
    r[32] = (a[32] >> n) | ((a[33] << lift) & 0x1fffffff);
    r[33] = (a[33] >> n) | ((a[34] << lift) & 0x1fffffff);
    r[34] = (a[34] >> n) | ((a[35] << lift) & 0x1fffffff);
#endif /* WOLFSSL_SP_SMALL */
    /* Nothing above the top digit to shift in. */
    r[35] = a[35] >> n;
}

/* Divide the two-digit value (d1 * 2^29 + d0) by div.
 *
 * The implementation is selected at compile time:
 *   - SP_USE_DIVTI3: a plain 64-bit division;
 *   - x86 / x86_64: a single idiv instruction;
 *   - other targets except aarch64 (unless SP_DIV_WORD_USE_DIV is defined):
 *     a bit-by-bit estimate that uses no division instruction, followed by
 *     correction steps;
 *   - otherwise: three narrower divisions whose partial quotients are summed.
 *
 * d1   Upper digit of the dividend.
 * d0   Lower digit of the dividend.
 * div  Divisor.
 * returns the computed quotient.
 */
static WC_INLINE sp_digit sp_2048_div_word_36(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* idiv divides EDX:EAX by the operand, leaving the quotient in EAX and
     * the remainder in EDX. Both registers are therefore written by the
     * instruction and must be declared as read-write operands; declaring
     * EDX as input-only would let the compiler assume it still holds hi. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo), "+d" (hi)
        : "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Bit-serial estimate: each step brings the next bit of t0 into t1,
     * compares against dv without branching (sign bit of dv - t1) and
     * conditionally subtracts. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final adjustments of at most one each, in the direction given by
     * the sign of the remainder. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
    return r;
#else
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 14) + 1;

    /* First partial quotient from the top bits of d. */
    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    /* Second partial quotient from what remains. */
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    /* The remainder is now divided by div directly. */
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit by a single digit.
 *
 * On x86, x86_64 and aarch64, or when SP_DIV_WORD_USE_DIV is defined, this
 * is an ordinary division. Otherwise no division is performed: the result is
 * the sign bit of (div - d) taken as a 32-bit value, i.e. 1 when that
 * difference is negative and 0 otherwise. NOTE(review): that variant can
 * only yield 0 or 1, so presumably the caller guarantees d < 2 * div -
 * confirm.
 *
 * d    Number to be divided.
 * div  Number to divide with.
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_2048_word_div_word_36(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are first multiplied by 2^20 (presumably so that the top
 * digit of the divisor uses all 29 bits - confirm). Quotient digits are then
 * estimated from the top two digits of the running remainder, each followed
 * by a correction step, and the remainder is finally shifted back down by
 * 20 bits.
 *
 * a  Number to be divided; 72 digits are read.
 * d  Number to divide with; 36 digits are read.
 * m  Multiplier result. Unused.
 * r  Remainder from the division; 72 digits are written.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_36(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 36 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 36 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    if (err == MP_OKAY) {
        /* One buffer, three areas: t1 (73 digits) holds the scaled
         * dividend / running remainder, t2 (37 digits) a scaled-divisor
         * multiple, sd (37 digits) the scaled divisor. */
        t2 = t1 + 72 + 1;
        sd = t2 + 36 + 1;

        sp_2048_mul_d_36(sd, d, (sp_digit)1 << 20);
        sp_2048_mul_d_72(t1, a, (sp_digit)1 << 20);
        /* Top digit of the scaled divisor drives the quotient estimates. */
        dv = sd[35];
        t1[36 + 36] += t1[36 + 36 - 1] >> 29;
        t1[36 + 36 - 1] &= 0x1fffffff;
        for (i=36; i>=0; i--) {
            /* Estimate a quotient digit and subtract that multiple. */
            r1 = sp_2048_div_word_36(t1[36 + i], t1[36 + i - 1], dv);

            sp_2048_mul_d_36(t2, sd, r1);
            (void)sp_2048_sub_36(&t1[i], &t1[i], t2);
            sp_2048_norm_36(&t1[i]);
            t1[36 + i] -= t2[36];
            t1[36 + i] += t1[36 + i - 1] >> 29;
            t1[36 + i - 1] &= 0x1fffffff;
            /* Correction: estimate from the negated top digits and add that
             * multiple of the divisor back. */
            r1 = sp_2048_div_word_36(-t1[36 + i], -t1[36 + i - 1], dv);
            r1 -= t1[36 + i];
            sp_2048_mul_d_36(t2, sd, r1);
            (void)sp_2048_add_36(&t1[i], &t1[i], t2);
            t1[36 + i] += t1[36 + i - 1] >> 29;
            t1[36 + i - 1] &= 0x1fffffff;
        }
        t1[36 - 1] += t1[36 - 2] >> 29;
        t1[36 - 2] &= 0x1fffffff;
        /* Final single-digit quotient step. */
        r1 = sp_2048_word_div_word_36(t1[36 - 1], dv);

        sp_2048_mul_d_36(t2, sd, r1);
        (void)sp_2048_sub_36(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 72U);
        /* Propagate carries through the low 36 digits. */
        for (i=0; i<35; i++) {
            r[i+1] += r[i] >> 29;
            r[i] &= 0x1fffffff;
        }
        /* If the remainder went negative, add the scaled divisor back;
         * the mask is taken from the sign of the top digit. */
        sp_2048_cond_add_36(r, r, sd, r[35] >> 31);

        sp_2048_norm_36(r);
        /* Undo the scaling by 2^20. */
        sp_2048_rshift_36(r, r, 20);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper over sp_2048_div_36 that keeps only the remainder; NULL is
 * passed for the unused quotient argument.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_36(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    return sp_2048_div_36(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r     A single precision number that is the result of the operation.
 * a     A single precision number being exponentiated.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when base is even or exponent is 0.
 */
static int sp_2048_mod_exp_36(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 72];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 36 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 36 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 36U * 2U);
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_36(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 36U);
        }
    }
    if (err == MP_OKAY) {
        sp_2048_mul_36(t[1], t[1], norm);
        err = sp_2048_mod_36(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 36 * 2);
            sp_2048_mont_sqr_36(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 36 * 2);
        }

        sp_2048_mont_reduce_36(t[0], m, mp);
        n = sp_2048_cmp_36(t[0], m);
        sp_2048_cond_sub_36(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 36 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 72];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 36 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 36 * 2);
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_36(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_36(t[1], t[1], norm);
                err = sp_2048_mod_36(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_36(t[1], a, norm);
            err = sp_2048_mod_36(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_2048_mont_mul_36(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 36 * 2);
            sp_2048_mont_sqr_36(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 36 * 2);
        }

        sp_2048_mont_reduce_36(t[0], m, mp);
        n = sp_2048_cmp_36(t[0], m);
        sp_2048_cond_sub_36(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 36 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(32 * 72) + 72];
#endif
    sp_digit* t[32];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 72) + 72), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<32; i++)
            t[i] = td + i * 72;
        rt = td + 2304;

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_36(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_36(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_36(t[1], t[1], norm);
                err = sp_2048_mod_36(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_36(t[1], a, norm);
            err = sp_2048_mod_36(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_mont_sqr_36(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_36(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_36(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_36(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_36(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_36(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_36(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_36(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_36(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_36(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_36(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_36(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_36(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_36(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_36(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_36(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_36(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_36(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_36(t[20], t[10], m, mp);
        sp_2048_mont_mul_36(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_36(t[22], t[11], m, mp);
        sp_2048_mont_mul_36(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_36(t[24], t[12], m, mp);
        sp_2048_mont_mul_36(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_36(t[26], t[13], m, mp);
        sp_2048_mont_mul_36(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_36(t[28], t[14], m, mp);
        sp_2048_mont_mul_36(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_36(t[30], t[15], m, mp);
        sp_2048_mont_mul_36(t[31], t[16], t[15], m, mp);

        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        if (i < 36) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 5) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        y = (int)((n >> 27) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 72);
        while ((i >= 0) || (c >= 5)) {
            if (c >= 5) {
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                n = e[i--] << 3;
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c = 24;
            }
            else {
                y = (byte)((n >> 27) & 0x1f);
                n = e[i--] << 3;
                c = 5 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);
            sp_2048_mont_sqr_36(rt, rt, m, mp);

            sp_2048_mont_mul_36(rt, rt, t[y], m, mp);
        }

        sp_2048_mont_reduce_36(rt, m, mp);
        n = sp_2048_cmp_36(rt, m);
        sp_2048_cond_sub_36(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 72);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Calculate the Montgomery normalizer: r = 2^2048 mod m.
 * The modulus is required to be a full 2048 bits, so one subtraction of m
 * is all the reduction that is performed.
 *
 * r  A single precision number; 72 digits are written.
 * m  A single precision number that is the modulus.
 */
static void sp_2048_mont_norm_72(sp_digit* r, const sp_digit* m)
{
    int k;

    /* Build 2^2048 - 1: seventy all-ones 29-bit digits ... */
#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 70; k++) {
        r[k] = 0x1fffffff;
    }
#else
    /* First 64 digits eight at a time, then the remaining six. */
    for (k = 0; k < 64; k += 8) {
        sp_digit* w = r + k;

        w[0] = 0x1fffffff;
        w[1] = 0x1fffffff;
        w[2] = 0x1fffffff;
        w[3] = 0x1fffffff;
        w[4] = 0x1fffffff;
        w[5] = 0x1fffffff;
        w[6] = 0x1fffffff;
        w[7] = 0x1fffffff;
    }
    for (k = 64; k < 70; k++) {
        r[k] = 0x1fffffff;
    }
#endif /* WOLFSSL_SP_SMALL */
    /* ... topped by 18 set bits (70 * 29 + 18 = 2048) and a zero digit. */
    r[70] = 0x3ffffL;
    r[71] = 0;

    /* Take the modulus away: r = (2^2048 - 1) - m. */
    (void)sp_2048_sub_72(r, r, m);

    /* One more gives r = 2^2048 - m, i.e. 2^2048 mod m. */
    r[0] += 1;
}

/* Compare two 72-digit numbers without branching on the data.
 *
 * Digits are visited from most significant to least significant. Each digit
 * difference is OR-ed into the result through a mask derived from the result
 * accumulated so far.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_2048_cmp_72(const sp_digit* a, const sp_digit* b)
{
    sp_digit res = 0;
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 71; k >= 0; k--) {
        res |= (a[k] - b[k]) & ~(((sp_digit)0 - res) >> 28);
    }
#else
    /* Eight digits per pass, highest digit of each group first. */
    for (k = 64; k >= 0; k -= 8) {
        const sp_digit* lhs = a + k;
        const sp_digit* rhs = b + k;

        res |= (lhs[7] - rhs[7]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[6] - rhs[6]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[5] - rhs[5]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[4] - rhs[4]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[3] - rhs[3]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[2] - rhs[2]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[1] - rhs[1]) & ~(((sp_digit)0 - res) >> 28);
        res |= (lhs[0] - rhs[0]) & ~(((sp_digit)0 - res) >> 28);
    }
#endif /* WOLFSSL_SP_SMALL */

    return res;
}

/* Subtract b from a digit by digit when the mask is set: r = a - (b & m).
 * No carry/borrow is propagated between digits here.
 *
 * r  A single precision number receiving the result (72 digits).
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply: -1 to subtract, 0 to leave a unchanged.
 */
static void sp_2048_cond_sub_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 72; k++) {
        r[k] = a[k] - (b[k] & m);
    }
#else
    /* Nine passes of eight digits. */
    for (k = 0; k < 72; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* lhs = a + k;
        const sp_digit* rhs = b + k;

        out[0] = lhs[0] - (rhs[0] & m);
        out[1] = lhs[1] - (rhs[1] & m);
        out[2] = lhs[2] - (rhs[2] & m);
        out[3] = lhs[3] - (rhs[3] & m);
        out[4] = lhs[4] - (rhs[4] & m);
        out[5] = lhs[5] - (rhs[5] & m);
        out[6] = lhs[6] - (rhs[6] & m);
        out[7] = lhs[7] - (rhs[7] & m);
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * The low 72 digits of r are rewritten as normalized 29-bit values and the
 * final carry is accumulated into r[72], so r must have at least 73 digits.
 * Three equivalent implementations are selected at compile time: a plain
 * loop (default), and 4-way / 8-way unrolled loops under
 * WOLFSSL_SP_LARGE_CODE.
 *
 * r  A single precision integer (at least 73 digits).
 * a  A single precision integer (72 digits are read).
 * b  A scalar.
 */
SP_NOINLINE static void sp_2048_mul_add_72(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 tb = b;
    sp_int64 t = 0;
    int i;

    /* t carries the running sum; 29 bits are emitted per digit. */
    for (i = 0; i < 72; i++) {
        t += r[i];
        t += tb * a[i];
        r[i] = ((sp_digit)t) & 0x1fffffff;
        t >>= 29;
    }
    /* Left-over carry is added to (not stored over) the next digit. */
    r[72] += (sp_digit)t;
#else
#ifdef WOLFSSL_SP_SMALL
    sp_int64 tb = b;
    sp_int64 t[4];
    int i;

    /* Four digits per pass; t[0] holds the carry into the next group. */
    t[0] = 0;
    for (i = 0; i < 68; i += 4) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[0]  = t[3] >> 29;
    }
    /* Final group (digits 68..71): the carry out goes to r[72]. */
    t[0] += (tb * a[68]) + r[68];
    t[1]  = (tb * a[69]) + r[69];
    t[2]  = (tb * a[70]) + r[70];
    t[3]  = (tb * a[71]) + r[71];
    r[68] = t[0] & 0x1fffffff;
    t[1] += t[0] >> 29;
    r[69] = t[1] & 0x1fffffff;
    t[2] += t[1] >> 29;
    r[70] = t[2] & 0x1fffffff;
    t[3] += t[2] >> 29;
    r[71] = t[3] & 0x1fffffff;
    r[72] +=  (sp_digit)(t[3] >> 29);
#else
    sp_int64 tb = b;
    sp_int64 t[8];
    int i;

    /* Eight digits per pass; t[0] holds the carry into the next group. */
    t[0] = 0;
    for (i = 0; i < 64; i += 8) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        t[4]  = (tb * a[i+4]) + r[i+4];
        t[5]  = (tb * a[i+5]) + r[i+5];
        t[6]  = (tb * a[i+6]) + r[i+6];
        t[7]  = (tb * a[i+7]) + r[i+7];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[4] += t[3] >> 29;
        r[i+4] = t[4] & 0x1fffffff;
        t[5] += t[4] >> 29;
        r[i+5] = t[5] & 0x1fffffff;
        t[6] += t[5] >> 29;
        r[i+6] = t[6] & 0x1fffffff;
        t[7] += t[6] >> 29;
        r[i+7] = t[7] & 0x1fffffff;
        t[0]  = t[7] >> 29;
    }
    /* Final group (digits 64..71): the carry out goes to r[72]. */
    t[0] += (tb * a[64]) + r[64];
    t[1]  = (tb * a[65]) + r[65];
    t[2]  = (tb * a[66]) + r[66];
    t[3]  = (tb * a[67]) + r[67];
    t[4]  = (tb * a[68]) + r[68];
    t[5]  = (tb * a[69]) + r[69];
    t[6]  = (tb * a[70]) + r[70];
    t[7]  = (tb * a[71]) + r[71];
    r[64] = t[0] & 0x1fffffff;
    t[1] += t[0] >> 29;
    r[65] = t[1] & 0x1fffffff;
    t[2] += t[1] >> 29;
    r[66] = t[2] & 0x1fffffff;
    t[3] += t[2] >> 29;
    r[67] = t[3] & 0x1fffffff;
    t[4] += t[3] >> 29;
    r[68] = t[4] & 0x1fffffff;
    t[5] += t[4] >> 29;
    r[69] = t[5] & 0x1fffffff;
    t[6] += t[5] >> 29;
    r[70] = t[6] & 0x1fffffff;
    t[7] += t[6] >> 29;
    r[71] = t[7] & 0x1fffffff;
    r[72] +=  (sp_digit)(t[7] >> 29);
#endif /* WOLFSSL_SP_SMALL */
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the part of a above bit 2048 down to the bottom of r.
 *
 * Bit 2048 sits at bit 18 of digit 70, so the output is assembled from the
 * upper bits of a[70], then a[71] and upwards, each shifted into place by
 * 11 bits (29 - 18). Digits 71..141 of r are cleared afterwards.
 * r and a may be the same buffer (the reduction in this file calls it so).
 *
 * r  A single precision number (142 digits are written).
 * a  A single precision number (digits 70..141 are read).
 */
static void sp_2048_mont_shift_72(sp_digit* r, const sp_digit* a)
{
    int k;
    sp_int64 acc;

    /* Seed the accumulator with everything at or above bit 2048 in
     * digits 70 and 71. */
    acc = a[70] >> 18;
    acc += ((sp_int64)a[71]) << 11;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 70; k++) {
        r[k] = acc & 0x1fffffff;
        acc >>= 29;
        acc += ((sp_int64)a[72 + k]) << 11;
    }
#else
    /* Eight output digits per pass for the first 64 ... */
    for (k = 0; k < 64; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* in = a + 72 + k;

        out[0] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[0]) << 11;
        out[1] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[1]) << 11;
        out[2] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[2]) << 11;
        out[3] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[3]) << 11;
        out[4] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[4]) << 11;
        out[5] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[5]) << 11;
        out[6] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[6]) << 11;
        out[7] = acc & 0x1fffffff;
        acc >>= 29; acc += ((sp_int64)in[7]) << 11;
    }
    /* ... then digits 64..69 one at a time. */
    for (k = 64; k < 70; k++) {
        r[k] = acc & 0x1fffffff;
        acc >>= 29;
        acc += ((sp_int64)a[72 + k]) << 11;
    }
#endif /* WOLFSSL_SP_SMALL */
    /* Whatever is left is the top digit; it is stored unmasked. */
    r[70] = (sp_digit)acc;

    XMEMSET(&r[71], 0, sizeof(*r) * 71U);
}

/* Montgomery-reduce a double-width value in place back to 2048 bits.
 *
 * For each of the low digits a multiple of m is added so that the digit's
 * low bits become zero; the result is then shifted down by 2048 bits and m
 * is conditionally subtracted based on a comparison of the top digits.
 * The last step of the digit loop uses an 18-bit mask because the top
 * digit of a 2048-bit number holds only 18 bits.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_2048_mont_reduce_72(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int k;
    sp_digit factor;
    sp_digit topDiff;

    sp_2048_norm_72(a + 71);

#ifdef WOLFSSL_SP_DH
    if (mp == 1) {
        /* Multiplier of one: the factor is just the digit's low bits. */
        for (k = 0; k < 70; k++) {
            factor = a[k] & 0x1fffffff;
            sp_2048_mul_add_72(a + k, m, factor);
            a[k + 1] += a[k] >> 29;
        }
        factor = a[k] & 0x3ffffL;
        sp_2048_mul_add_72(a + k, m, factor);
        a[k + 1] += a[k] >> 29;
        a[k] &= 0x1fffffff;
    }
    else {
        for (k = 0; k < 70; k++) {
            factor = ((sp_uint32)a[k] * (sp_uint32)mp) & 0x1fffffff;
            sp_2048_mul_add_72(a + k, m, factor);
            a[k + 1] += a[k] >> 29;
        }
        factor = ((sp_uint32)a[k] * (sp_uint32)mp) & 0x3ffffL;
        sp_2048_mul_add_72(a + k, m, factor);
        a[k + 1] += a[k] >> 29;
        a[k] &= 0x1fffffff;
    }
#else
    for (k = 0; k < 70; k++) {
        factor = ((sp_uint32)a[k] * (sp_uint32)mp) & 0x1fffffff;
        sp_2048_mul_add_72(a + k, m, factor);
        a[k + 1] += a[k] >> 29;
    }
    /* k == 70 here: the partial top digit. */
    factor = ((sp_uint32)a[k] * (sp_uint32)mp) & 0x3ffffL;
    sp_2048_mul_add_72(a + k, m, factor);
    a[k + 1] += a[k] >> 29;
    a[k] &= 0x1fffffff;
#endif

    sp_2048_mont_shift_72(a, a);

    /* Subtract m when the top digit of a exceeds that of m. */
    topDiff = a[70] - m[70];
    sp_2048_cond_sub_72(a, a, m, ~((topDiff - 1) >> 31));
    sp_2048_norm_72(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The full product is written to r first and then Montgomery-reduced in
 * place, so r serves as the double-width workspace. The reduction in this
 * file touches digits of r up to index 142; callers visible in this file
 * pass 144-digit buffers.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_mul_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_2048_mul_72(r, a, b);
    sp_2048_mont_reduce_72(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r first and then Montgomery-reduced in
 * place, so r serves as the double-width workspace. The reduction in this
 * file touches digits of r up to index 142; callers visible in this file
 * pass 144-digit buffers.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_2048_mont_sqr_72(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_2048_sqr_72(r, a);
    sp_2048_mont_reduce_72(r, m, mp);
}

/* Propagate carries so that digits 0..69 each hold 29 bits.
 * The overflow of every digit is added into the next one; digit 70 receives
 * the final carry and is not itself masked.
 *
 * a  Array of sp_digit to normalize (71 digits).
 */
static void sp_2048_norm_71(sp_digit* a)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    for (k = 0; k < 70; k++) {
        a[k + 1] += a[k] >> 29;
        a[k] &= 0x1fffffff;
    }
#else
    /* Digits 0..63 in groups of eight ... */
    for (k = 0; k < 64; k += 8) {
        sp_digit* w = a + k;

        w[1] += w[0] >> 29; w[0] &= 0x1fffffff;
        w[2] += w[1] >> 29; w[1] &= 0x1fffffff;
        w[3] += w[2] >> 29; w[2] &= 0x1fffffff;
        w[4] += w[3] >> 29; w[3] &= 0x1fffffff;
        w[5] += w[4] >> 29; w[4] &= 0x1fffffff;
        w[6] += w[5] >> 29; w[5] &= 0x1fffffff;
        w[7] += w[6] >> 29; w[6] &= 0x1fffffff;
        w[8] += w[7] >> 29; w[7] &= 0x1fffffff;
    }
    /* ... then digits 64..69. */
    for (k = 64; k < 70; k++) {
        a[k + 1] += a[k] >> 29;
        a[k] &= 0x1fffffff;
    }
#endif /* WOLFSSL_SP_SMALL */
}

/* Multiply a 144-digit number by a single digit. (r = a * b)
 *
 * Output digits 0..143 are normalized to 29 bits; the final carry is stored
 * in r[144], so r needs 145 digits. In the unrolled build that last digit
 * is masked to 29 bits, in the small build it is stored as is.
 *
 * r  A single precision integer (145 digits are written).
 * a  A single precision integer (144 digits are read).
 * b  A scalar.
 */
SP_NOINLINE static void sp_2048_mul_d_144(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int k;
#ifndef WOLFSSL_SP_SMALL
    sp_int64 prod[4];

    /* Four digits per pass: form the products first, then fold them into
     * the running accumulator one digit at a time. */
    for (k = 0; k < 144; k += 4) {
        prod[0] = scalar * a[k + 0];
        prod[1] = scalar * a[k + 1];
        prod[2] = scalar * a[k + 2];
        prod[3] = scalar * a[k + 3];

        acc += prod[0];
        r[k + 0] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;

        acc += prod[1];
        r[k + 1] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;

        acc += prod[2];
        r[k + 2] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;

        acc += prod[3];
        r[k + 3] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
    }
    r[144] = (sp_digit)(acc & 0x1fffffff);
#else

    for (k = 0; k < 144; k++) {
        acc += scalar * a[k];
        r[k] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
    }
    r[144] = (sp_digit)acc;
#endif /* !WOLFSSL_SP_SMALL */
}

#ifdef WOLFSSL_SP_SMALL
/* Add b to a digit by digit when the mask is set: r = a + (b & m).
 * No carry is propagated between digits here.
 *
 * r  A single precision number receiving the result (72 digits).
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply: -1 to add, 0 to leave a unchanged.
 */
static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k = 0;

    while (k < 72) {
        r[k] = a[k] + (b[k] & m);
        k++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

#ifndef WOLFSSL_SP_SMALL
/* Add b to a digit by digit when the mask is set: r = a + (b & m).
 * Unrolled eight digits per pass; no carry is propagated between digits.
 *
 * r  A single precision number receiving the result (72 digits).
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply: -1 to add, 0 to leave a unchanged.
 */
static void sp_2048_cond_add_72(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

    for (k = 0; k < 72; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* lhs = a + k;
        const sp_digit* rhs = b + k;

        out[0] = lhs[0] + (rhs[0] & m);
        out[1] = lhs[1] + (rhs[1] & m);
        out[2] = lhs[2] + (rhs[2] & m);
        out[3] = lhs[3] + (rhs[3] & m);
        out[4] = lhs[4] + (rhs[4] & m);
        out[5] = lhs[5] + (rhs[5] & m);
        out[6] = lhs[6] + (rhs[6] & m);
        out[7] = lhs[7] + (rhs[7] & m);
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift a 72-digit number right by n bits. (r = a >> n)
 *
 * Each output digit combines the upper bits of its own input digit with the
 * low n bits of the next digit up. Digits are processed from the bottom, so
 * r may be the same buffer as a (the division in this file calls it so).
 *
 * r  A single precision number receiving the result (72 digits).
 * a  A single precision number to shift.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_2048_rshift_72(sp_digit* r, const sp_digit* a,
        byte n)
{
    int k;

#ifdef WOLFSSL_SP_SMALL
    /* The whole combined value is masked to 29 bits in this variant. */
    for (k = 0; k < 71; k++) {
        r[k] = ((a[k] >> n) | (a[k + 1] << (29 - n))) & 0x1fffffff;
    }
#else
    /* Only the bits brought in from the next digit are masked here. */
    for (k = 0; k < 64; k += 8) {
        sp_digit* out = r + k;
        const sp_digit* in = a + k;

        out[0] = (in[0] >> n) | ((in[1] << (29 - n)) & 0x1fffffff);
        out[1] = (in[1] >> n) | ((in[2] << (29 - n)) & 0x1fffffff);
        out[2] = (in[2] >> n) | ((in[3] << (29 - n)) & 0x1fffffff);
        out[3] = (in[3] >> n) | ((in[4] << (29 - n)) & 0x1fffffff);
        out[4] = (in[4] >> n) | ((in[5] << (29 - n)) & 0x1fffffff);
        out[5] = (in[5] >> n) | ((in[6] << (29 - n)) & 0x1fffffff);
        out[6] = (in[6] >> n) | ((in[7] << (29 - n)) & 0x1fffffff);
        out[7] = (in[7] >> n) | ((in[8] << (29 - n)) & 0x1fffffff);
    }
    for (k = 64; k < 71; k++) {
        r[k] = (a[k] >> n) | ((a[k + 1] << (29 - n)) & 0x1fffffff);
    }
#endif /* WOLFSSL_SP_SMALL */
    /* Nothing above the top digit to bring in. */
    r[71] = a[71] >> n;
}

/* Divide the two-digit value (d1 * 2^29 + d0) by a single digit.
 *
 * One of four implementations is selected at compile time:
 *  - SP_USE_DIVTI3: a plain 64-bit C division.
 *  - x86 / x86_64: the 32-bit idiv instruction on the 64-bit dividend.
 *  - default (not AArch64, SP_DIV_WORD_USE_DIV not defined): the quotient is
 *    built with shifts, masks and subtractions only, followed by correction
 *    steps; no divide operation is used in this path.
 *  - otherwise: three rounds of narrower native divisions that refine an
 *    estimate.
 *
 * d1   Upper digit of the dividend.
 * d0   Lower digit of the dividend.
 * div  The divisor (the division routine in this file passes the top digit
 *      of the scaled divisor).
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_2048_div_word_72(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* idiv divides EDX:EAX by the operand, leaving the quotient in EAX and
     * the remainder in EDX. EDX is therefore modified by the instruction and
     * has to be declared as an in/out operand; an input-only operand must
     * not be changed by the asm. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo), "+d" (hi)
        : "r" (div)
        : "cc"
    );
    /* The remainder left in hi is not needed. */
    (void)hi;

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Bit-by-bit estimate against dv = div/2 + 1. Each step's quotient bit
     * is the sign bit of (dv - t1), and the subtraction is applied through
     * a mask instead of a branch. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final +/-1 adjustments: sign is +1 or -1 according to the sign
     * of the remainder, t2 says whether |m| is larger than div. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
    return r;
#else
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 14) + 1;

    /* Estimate from the top bits with a shortened divisor, remove that
     * much from d, and repeat with progressively more precision. */
    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)d;
    t = t / div;
    r += t;
    /* The remainder is not needed once the last quotient part is added. */
    return r;
#endif
}
/* Quotient of a single digit by a single digit.
 *
 * Where a native divide is used (x86, x86_64, AArch64 or
 * SP_DIV_WORD_USE_DIV) this is d / div. Otherwise only the sign bit of
 * (div - d) is returned: 1 when d is greater than div, else 0.
 *
 * d    Digit to divide.
 * div  Digit to divide by.
 * returns the quotient as described above.
 */
static WC_INLINE sp_digit sp_2048_word_div_word_72(sp_digit d, sp_digit div)
{
    sp_digit quot;

#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    quot = d / div;
#else
    quot = (sp_digit)((sp_uint32)(div - d) >> 31);
#endif

    return quot;
}
/* Divide a by d and put the remainder into r. (m*d + r = a)
 * The multiplier m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are first multiplied by 2^11 and the remainder is shifted
 * back down by 11 bits at the end (presumably so the divisor's 18-bit top
 * digit fills a whole 29-bit digit). Each step estimates a quotient digit
 * from the top two digits of the current window, subtracts that multiple of
 * the scaled divisor, then applies a correction computed from the negated
 * top digits.
 *
 * a  Number to be divided (144 digits are read).
 * d  Number to divide with.
 * m  Multiplier result; unused.
 * r  Remainder from the division. 144 digits are written, so the buffer
 *    must be at least that large.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_div_72(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* num = NULL;
#else
    sp_digit num[4 * 72 + 3];
#endif
    sp_digit* prod = NULL;
    sp_digit* den = NULL;
    sp_digit top;
    sp_digit q;
    int k;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    num = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 72 + 3), NULL,
                             DYNAMIC_TYPE_TMP_BUFFER);
    if (num == NULL) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Workspace layout: scaled dividend (145 digits), then the partial
         * product (73 digits), then the scaled divisor (73 digits). */
        prod = num + 144 + 1;
        den = prod + 72 + 1;

        sp_2048_mul_d_72(den, d, (sp_digit)1 << 11);
        sp_2048_mul_d_144(num, a, (sp_digit)1 << 11);
        top = den[70];

        num[142] += num[141] >> 29;
        num[141] &= 0x1fffffff;
        for (k = 71; k >= 0; k--) {
            /* 72-digit window of the dividend; its leading digits are
             * win[71] and win[70]. */
            sp_digit* win = &num[k];

            q = sp_2048_div_word_72(win[71], win[70], top);

            sp_2048_mul_d_72(prod, den, q);
            (void)sp_2048_sub_72(win, win, prod);
            sp_2048_norm_71(win);
            win[71] += win[70] >> 29;
            win[70] &= 0x1fffffff;

            /* Correction derived from the negated leading digits. */
            q = sp_2048_div_word_72(-win[71], -win[70], top);
            q -= win[71];
            sp_2048_mul_d_72(prod, den, q);
            (void)sp_2048_add_72(win, win, prod);
            win[71] += win[70] >> 29;
            win[70] &= 0x1fffffff;
        }

        /* One last single-digit quotient step on the low window. */
        num[70] += num[69] >> 29;
        num[69] &= 0x1fffffff;
        q = sp_2048_word_div_word_72(num[70], top);

        sp_2048_mul_d_72(prod, den, q);
        (void)sp_2048_sub_72(num, num, prod);

        XMEMCPY(r, num, sizeof(*r) * 144U);
        for (k = 0; k < 70; k++) {
            r[k + 1] += r[k] >> 29;
            r[k] &= 0x1fffffff;
        }
        /* Add the scaled divisor back when the result is negative. */
        sp_2048_cond_add_72(r, r, den, r[70] >> 31);

        /* Undo the 2^11 scaling. */
        sp_2048_norm_71(r);
        sp_2048_rshift_72(r, r, 11);
        r[71] = 0;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (num != NULL) {
        XFREE(num, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper over the division routine: only the remainder is kept and
 * NULL is passed for the unused multiplier argument. The division reads 144
 * digits of a and writes 144 digits of r.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_2048_mod_72(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    return sp_2048_div_72(a, m, NULL, r);
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r     A single precision number that is the result of the operation.
 * a     A single precision number being exponentiated.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when base is even or exponent is 0.
 */
static int sp_2048_mod_exp_72(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 144];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 72 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 72 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 72U * 2U);
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_72(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_72(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 72U);
        }
    }
    if (err == MP_OKAY) {
        sp_2048_mul_72(t[1], t[1], norm);
        err = sp_2048_mod_72(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_2048_mont_mul_72(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 72 * 2);
            sp_2048_mont_sqr_72(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 72 * 2);
        }

        sp_2048_mont_reduce_72(t[0], m, mp);
        n = sp_2048_cmp_72(t[0], m);
        sp_2048_cond_sub_72(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 72 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 144];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 72 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 72 * 2);
        }

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_72(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_72(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_72(t[1], t[1], norm);
                err = sp_2048_mod_72(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_72(t[1], a, norm);
            err = sp_2048_mod_72(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_2048_mont_mul_72(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 72 * 2);
            sp_2048_mont_sqr_72(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 72 * 2);
        }

        sp_2048_mont_reduce_72(t[0], m, mp);
        n = sp_2048_cmp_72(t[0], m);
        sp_2048_cond_sub_72(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 72 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(16 * 144) + 144];
#endif
    sp_digit* t[16];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 144) + 144), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<16; i++)
            t[i] = td + i * 144;
        rt = td + 2304;

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_72(norm, m);

        if (reduceA != 0) {
            err = sp_2048_mod_72(t[1], a, m);
            if (err == MP_OKAY) {
                sp_2048_mul_72(t[1], t[1], norm);
                err = sp_2048_mod_72(t[1], t[1], m);
            }
        }
        else {
            sp_2048_mul_72(t[1], a, norm);
            err = sp_2048_mod_72(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_mont_sqr_72(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_72(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_72(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_72(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_72(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_72(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_72(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_72(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_72(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_72(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_72(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_72(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_72(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_72(t[15], t[ 8], t[ 7], m, mp);

        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        if (i < 72) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 4) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 144);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            sp_2048_mont_sqr_72(rt, rt, m, mp);
            sp_2048_mont_sqr_72(rt, rt, m, mp);
            sp_2048_mont_sqr_72(rt, rt, m, mp);
            sp_2048_mont_sqr_72(rt, rt, m, mp);

            sp_2048_mont_mul_72(rt, rt, t[y], m, mp);
        }

        sp_2048_mont_reduce_72(rt, m, mp);
        n = sp_2048_cmp_72(rt, m);
        sp_2048_cond_sub_72(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 144);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */
       /* WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* Perform the RSA public key operation: out = in ^ em mod mm.
 *
 * in      Big-endian bytes of the base to exponentiate.
 * inLen   Length of the base in bytes. Must not exceed 256.
 * em      Public exponent. Must be non-zero and no longer than 64 bits.
 * mm      Modulus. Must be exactly 2048 bits long and odd.
 * out     Buffer that receives the big-endian bytes of the result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the size of out in bytes; set to 256 on success.
 * returns 0 on success, MP_TO_E when outLen is too small, MP_READ_E when the
 * exponent or base is too long or the modulus is not 2048 bits long, MP_VAL
 * when the modulus is even, MP_EXPTMOD_E when the exponent is zero, MEMORY_E
 * when dynamic memory allocation fails, or the error reported by the modular
 * reduction.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
    /* One working buffer split into: base (144) | result (144) | modulus (72) */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* buf = NULL;
#else
    sp_digit buf[72 * 5];
#endif
    sp_digit* base = NULL;
    sp_digit* res = NULL;
    sp_digit* mod = NULL;
    sp_uint64 expo = 0;
    int err;

    /* Validate the arguments in the same order as the checks are documented. */
    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    else if ((mp_count_bits(em) > 64) || (inLen > 256U) ||
             (mp_count_bits(mm) != 2048)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }
    else {
        err = MP_OKAY;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        buf = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (buf == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        base = buf;
        res = base + 72 * 2;
        mod = res + 72 * 2;

        sp_2048_from_bin(base, 72, in, inLen);

        /* Gather the exponent from the low one or two digits of em. */
        expo = (sp_uint64)em->dp[0];
#if DIGIT_BIT < 64
        if (em->used > 1) {
            expo |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (expo == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_2048_from_mp(mod, 72, mm);

#ifndef WOLFSSL_SP_SMALL
        if (expo == 0x3) {
            /* Exponent of 3: one square and one multiply, each reduced. */
            sp_2048_sqr_72(res, base);
            err = sp_2048_mod_72(res, res, mod);
            if (err == MP_OKAY) {
                sp_2048_mul_72(res, base, res);
                err = sp_2048_mod_72(res, res, mod);
            }
        }
        else
#endif
        {
            sp_digit mp = 0;
            sp_digit cmp;
            int bit = 63;

            sp_2048_mont_setup(mod, &mp);
            /* The result area doubles as storage for the normalizer. */
            sp_2048_mont_norm_72(res, mod);

            /* Convert the base to Montgomery form. */
            sp_2048_mul_72(base, base, res);
            err = sp_2048_mod_72(base, base, mod);

            if (err == MP_OKAY) {
                /* Locate the most significant set bit of the exponent. */
                while ((bit >= 0) && ((expo >> bit) == 0)) {
                    bit--;
                }

                /* Left-to-right square and multiply over the remaining bits;
                 * the top bit is accounted for by starting from the base. */
                XMEMCPY(res, base, sizeof(sp_digit) * 72 * 2);
                while (--bit >= 0) {
                    sp_2048_mont_sqr_72(res, res, mod, mp);

                    if (((expo >> bit) & 1) == 1) {
                        sp_2048_mont_mul_72(res, res, base, mod, mp);
                    }
                }

                /* Leave Montgomery form and subtract the modulus once if the
                 * value is not less than it. */
                sp_2048_mont_reduce_72(res, mod, mp);
                cmp = sp_2048_cmp_72(res, mod);
                sp_2048_cond_sub_72(res, res, mod, ~(cmp >> 31));
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_72(res, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (buf != NULL) {
        XFREE(buf, NULL, DYNAMIC_TYPE_RSA);
    }
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM)
#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */
/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM is defined the result is computed
 * directly as in ^ dm mod mm and the CRT parameters are ignored. Otherwise the
 * Chinese Remainder Theorem form is used and dm is ignored.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. Must not exceed 256.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus. Must be exactly 2048 bits long and odd.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the size of out in bytes; set to 256 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 2048 bits long, MP_VAL when the
 * modulus (or, in the CRT build, either prime) is even, MEMORY_E when dynamic
 * memory allocation fails, or the error reported by the exponentiation or
 * reduction helpers.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
    /* Non-CRT form. The code is the same whether or not WOLFSSL_SP_SMALL is
     * defined. Buffer layout: d (72) | a / r (144) | m (72). */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[72 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 2048) {
            err = MP_READ_E;
        }
        else if (inLen > 256U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 2048) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        a = d + 72;
        m = a + 144;
        r = a;

        sp_2048_from_bin(a, 72, in, inLen);
        sp_2048_from_mp(d, 72, dm);
        sp_2048_from_mp(m, 72, mm);
        err = sp_2048_mod_exp_72(r, a, d, 2048, m, 0);
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_72(r, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* The buffer holds the private exponent as well as the input and the
         * double-width result area, so clear all of it rather than only the
         * first 72 digits of "a". */
        ForceZero(d, sizeof(sp_digit) * 72 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#elif defined(WOLFSSL_SP_SMALL)
    /* CRT form, small code size. A single slot is reused for p/q and a single
     * slot for dp/dq/qi, each reloaded just before it is needed.
     * Buffer layout: a / r (72) | p (36) | dp,dq,qi (36) | tmpa (72) |
     * tmpb (72). */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[36 * 8];
#endif
    sp_digit* p = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;

    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 256U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 2048) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 8, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif
    if (err == MP_OKAY) {
        p = a + 72;
        qi = dq = dp = p + 36;
        tmpa = qi + 36;
        tmpb = tmpa + 72;
        r = a;

        /* tmpa = in ^ dp mod p */
        sp_2048_from_bin(a, 72, in, inLen);
        sp_2048_from_mp(p, 36, pm);
        sp_2048_from_mp(dp, 36, dpm);
        err = sp_2048_mod_exp_36(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q (q loaded into the "p" slot) */
        sp_2048_from_mp(p, 36, qm);
        sp_2048_from_mp(dq, 36, dqm);
        err = sp_2048_mod_exp_36(tmpb, a, dq, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) mod p: p is added back, up to twice, while the
         * top digit shows the difference is negative. */
        sp_2048_from_mp(p, 36, pm);
        (void)sp_2048_sub_36(tmpa, tmpa, tmpb);
        sp_2048_norm_36(tmpa);
        sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31));
        sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31));
        sp_2048_norm_36(tmpa);

        /* tmpa = tmpa * qi mod p */
        sp_2048_from_mp(qi, 36, qim);
        sp_2048_mul_36(tmpa, tmpa, qi);
        err = sp_2048_mod_36(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_from_mp(p, 36, qm);
        sp_2048_mul_36(tmpa, p, tmpa);
        (void)sp_2048_add_72(r, tmpb, tmpa);
        sp_2048_norm_72(r);

        sp_2048_to_bin_72(r, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Every slot holds key material or intermediates: clear it all. */
        ForceZero(a, sizeof(sp_digit) * 36 * 8);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
    /* CRT form with every parameter given its own slot.
     * Buffer layout: a / r (144) | p | q | dp | dq | qi (36 each) |
     * tmpa (72) | tmpb (72). */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[36 * 13];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;

    if (*outLen < 256U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 256U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 2048) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 13, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        p = a + 72 * 2;
        q = p + 36;
        dp = q + 36;
        dq = dp + 36;
        qi = dq + 36;
        tmpa = qi + 36;
        tmpb = tmpa + 72;
        r = a;

        sp_2048_from_bin(a, 72, in, inLen);
        sp_2048_from_mp(p, 36, pm);
        sp_2048_from_mp(q, 36, qm);
        sp_2048_from_mp(dp, 36, dpm);
        sp_2048_from_mp(dq, 36, dqm);
        sp_2048_from_mp(qi, 36, qim);

        /* tmpa = in ^ dp mod p */
        err = sp_2048_mod_exp_36(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        err = sp_2048_mod_exp_36(tmpb, a, dq, 1024, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = ((tmpa - tmpb) mod p) * qi mod p: p is added back, up to
         * twice, while the top digit shows the difference is negative. */
        (void)sp_2048_sub_36(tmpa, tmpa, tmpb);
        sp_2048_norm_36(tmpa);
        sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31));
        sp_2048_cond_add_36(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[35] >> 31));
        sp_2048_norm_36(tmpa);
        sp_2048_mul_36(tmpa, tmpa, qi);
        err = sp_2048_mod_36(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_2048_mul_36(tmpa, tmpa, q);
        (void)sp_2048_add_72(r, tmpb, tmpa);
        sp_2048_norm_72(r);

        sp_2048_to_bin_72(r, out);
        *outLen = 256;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Every slot holds key material or intermediates: clear it all. */
        ForceZero(a, sizeof(sp_digit) * 36 * 13);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
}

#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * The input is read as 71 digits of 29 bits each, least significant digit
 * first, and repacked into digits of DIGIT_BIT bits in r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer. Grown to hold 2048 bits before being written.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow.
 */
static int sp_2048_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 29
        /* Same digit width: copy the digits across unchanged.
         * NOTE(review): the byte count uses sizeof(sp_digit), so this assumes
         * mp_digit and sp_digit have the same size - confirm. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 71);
        r->used = 71;
        mp_clamp(r);
#elif DIGIT_BIT < 29
        /* mp_digit is narrower than 29 bits: each input digit is spread over
         * two or more output digits. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 71; i++) {
            /* On entry s is the number of bits already filled in r->dp[j]. */
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            /* s becomes the number of bits of a[i] consumed so far. */
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            /* While a whole further output digit fits in what is left of
             * a[i], mask the current one and start the next. */
            while (s + DIGIT_BIT <= 29) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            /* Back to "bits already filled in r->dp[j]" for the next digit. */
            s = 29 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp_digit is wider than 29 bits: several input digits are packed
         * into each output digit. */
        /* NOTE(review): when DIGIT_BIT is 32 or 64 the last iteration (i == 70)
         * appears to store to r->dp[64] / r->dp[32], one index past the digit
         * count requested from mp_grow above - confirm that r always has room
         * for that extra digit. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 71; i++) {
            /* s is the bit offset within r->dp[j] at which a[i] is placed. */
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 29 >= DIGIT_BIT) {
                /* a[i] reaches the end of r->dp[j]: put its remaining high
                 * bits at the bottom of the next output digit. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 29 - s;
            }
            else {
                s += 29;
            }
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base ^ exp mod mod. The same code is used whether or not
 * WOLFSSL_SP_SMALL is defined.
 *
 * base  Base. MP integer. Must not be longer than 2048 bits.
 * exp   Exponent. MP integer. Must not be longer than 2048 bits.
 * mod   Modulus. MP integer. Must be exactly 2048 bits long and odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if base or exponent is too long or the
 * modulus is not 2048 bits long, MP_VAL if the modulus is even, MEMORY_E if
 * memory allocation fails, or the error reported by sp_2048_mod_exp_72 or
 * sp_2048_to_mp.
 */
int sp_ModExp_2048(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    /* Buffer layout: b / r (144) | e (72) | m (72). */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[72 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 2048) {
        err = MP_READ_E;
    }
    else if (expBits > 2048) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 2048) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        e = b + 72 * 2;
        m = e + 72;
        r = b;

        sp_2048_from_mp(b, 72, base);
        sp_2048_from_mp(e, 72, exp);
        sp_2048_from_mp(m, 72, mod);

        /* The result overwrites the base in place. */
        err = sp_2048_mod_exp_72(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_2048_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* Besides the exponent, the buffer holds the result, which may be a
         * Diffie-Hellman shared secret, so clear all of it. */
        ForceZero(b, sizeof(sp_digit) * 72U * 4U);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
#endif
    }

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_2048
/* Shift a 72 digit number left by n bits. (r = a << n)
 *
 * Digits hold 29 bits each. The bits shifted out of the top digit are stored
 * in r[72], so r must have room for 73 digits. Digits are processed from the
 * most significant down, reading each input digit before the output digit of
 * the same index is written.
 *
 * All shifts are done on the unsigned sp_int_digit type: left-shifting the
 * signed sp_digit could overflow, which is undefined behaviour in C.
 *
 * r  A single precision number that is the result of the operation.
 * a  A single precision number to shift.
 * n  Number of bits to shift by. The caller visible here passes a 4-bit value.
 */
SP_NOINLINE static void sp_2048_lshift_72(sp_digit* r, const sp_digit* a,
        byte n)
{
#ifdef WOLFSSL_SP_SMALL
    int i;

    r[72] = (sp_digit)((sp_int_digit)a[71] >> (29U - n));
    for (i=71; i>0; i--) {
        r[i] = (sp_digit)((((sp_int_digit)a[i] << n) |
                           ((sp_int_digit)a[i-1] >> (29U - n))) & 0x1fffffff);
    }
#else
    sp_int_digit s;
    sp_int_digit t;

    s = (sp_int_digit)a[71];
    r[72] = s >> (29U - n);
    s = (sp_int_digit)(a[71]); t = (sp_int_digit)(a[70]);
    r[71] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[70]); t = (sp_int_digit)(a[69]);
    r[70] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[69]); t = (sp_int_digit)(a[68]);
    r[69] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[68]); t = (sp_int_digit)(a[67]);
    r[68] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[67]); t = (sp_int_digit)(a[66]);
    r[67] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[66]); t = (sp_int_digit)(a[65]);
    r[66] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[65]); t = (sp_int_digit)(a[64]);
    r[65] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[64]); t = (sp_int_digit)(a[63]);
    r[64] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[63]); t = (sp_int_digit)(a[62]);
    r[63] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[62]); t = (sp_int_digit)(a[61]);
    r[62] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[61]); t = (sp_int_digit)(a[60]);
    r[61] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[60]); t = (sp_int_digit)(a[59]);
    r[60] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[59]); t = (sp_int_digit)(a[58]);
    r[59] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[58]); t = (sp_int_digit)(a[57]);
    r[58] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[57]); t = (sp_int_digit)(a[56]);
    r[57] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[56]); t = (sp_int_digit)(a[55]);
    r[56] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[55]); t = (sp_int_digit)(a[54]);
    r[55] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[54]); t = (sp_int_digit)(a[53]);
    r[54] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[53]); t = (sp_int_digit)(a[52]);
    r[53] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[52]); t = (sp_int_digit)(a[51]);
    r[52] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[51]); t = (sp_int_digit)(a[50]);
    r[51] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[50]); t = (sp_int_digit)(a[49]);
    r[50] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[49]); t = (sp_int_digit)(a[48]);
    r[49] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[48]); t = (sp_int_digit)(a[47]);
    r[48] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[47]); t = (sp_int_digit)(a[46]);
    r[47] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[46]); t = (sp_int_digit)(a[45]);
    r[46] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[45]); t = (sp_int_digit)(a[44]);
    r[45] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[44]); t = (sp_int_digit)(a[43]);
    r[44] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[43]); t = (sp_int_digit)(a[42]);
    r[43] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[42]); t = (sp_int_digit)(a[41]);
    r[42] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[41]); t = (sp_int_digit)(a[40]);
    r[41] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[40]); t = (sp_int_digit)(a[39]);
    r[40] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[39]); t = (sp_int_digit)(a[38]);
    r[39] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[38]); t = (sp_int_digit)(a[37]);
    r[38] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[37]); t = (sp_int_digit)(a[36]);
    r[37] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[36]); t = (sp_int_digit)(a[35]);
    r[36] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[35]); t = (sp_int_digit)(a[34]);
    r[35] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[34]); t = (sp_int_digit)(a[33]);
    r[34] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[33]); t = (sp_int_digit)(a[32]);
    r[33] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[32]); t = (sp_int_digit)(a[31]);
    r[32] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[31]); t = (sp_int_digit)(a[30]);
    r[31] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[30]); t = (sp_int_digit)(a[29]);
    r[30] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[29]); t = (sp_int_digit)(a[28]);
    r[29] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[28]); t = (sp_int_digit)(a[27]);
    r[28] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[27]); t = (sp_int_digit)(a[26]);
    r[27] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[26]); t = (sp_int_digit)(a[25]);
    r[26] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[25]); t = (sp_int_digit)(a[24]);
    r[25] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[24]); t = (sp_int_digit)(a[23]);
    r[24] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[23]); t = (sp_int_digit)(a[22]);
    r[23] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[22]); t = (sp_int_digit)(a[21]);
    r[22] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[21]); t = (sp_int_digit)(a[20]);
    r[21] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[20]); t = (sp_int_digit)(a[19]);
    r[20] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[19]); t = (sp_int_digit)(a[18]);
    r[19] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[18]); t = (sp_int_digit)(a[17]);
    r[18] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]);
    r[17] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]);
    r[16] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]);
    r[15] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]);
    r[14] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]);
    r[13] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]);
    r[12] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]);
    r[11] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]);
    r[10] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]);
    r[9] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]);
    r[8] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]);
    r[7] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]);
    r[6] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]);
    r[5] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]);
    r[4] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]);
    r[3] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]);
    r[2] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]);
    r[1] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
#endif /* WOLFSSL_SP_SMALL */
    /* Lowest digit has nothing shifted in from below. */
    r[0] = (sp_digit)(((sp_int_digit)a[0] << n) & 0x1fffffff);
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from the most significant end in 4-bit windows.
 * Each window is applied as four Montgomery squarings followed by a left
 * shift of r by the window value.
 *
 * r     A single precision number that is the result of the operation.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_2048_mod_exp_2_72(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[217];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 217, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td layout: norm uses the first 144 digits, tmp the remaining 73. */
        norm = td;
        tmp  = td + 144;
        XMEMSET(td, 0, sizeof(sp_digit) * 217);

        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_72(norm, m);

        /* Round the bit count up to a whole number of 4-bit windows. */
        bits = ((bits + 3) / 4) * 4;
        /* i: index of the top exponent digit, c: bits taken from it. */
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* Only read e[i] when it lies inside the 72 digit exponent. */
        if (i < 72) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* Fewer than 4 bits available: top up from the next digit down. */
        if (c < 4) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* First window: r = norm shifted left by the window value. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        sp_2048_lshift_72(r, norm, (byte)y);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* Whole window is available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is exhausted: load the next 29-bit digit. */
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                /* Window straddles two digits: combine the bits left in n
                 * with the top bits of the next digit. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* r = r^16 in Montgomery form. */
            sp_2048_mont_sqr_72(r, r, m, mp);
            sp_2048_mont_sqr_72(r, r, m, mp);
            sp_2048_mont_sqr_72(r, r, m, mp);
            sp_2048_mont_sqr_72(r, r, m, mp);

            /* Shift r left by the window value. */
            sp_2048_lshift_72(r, r, (byte)y);
            /* Bits at and above bit 2048 (70 * 29 + 18) are removed from r
             * and added back as that multiple of norm. */
            sp_2048_mul_d_72(tmp, norm, (r[71] << 11) + (r[70] >> 18));
            r[71] = 0;
            r[70] &= 0x3ffffL;
            (void)sp_2048_add_72(r, r, tmp);
            sp_2048_norm_72(r);
            /* Mask is all ones unless the compare result is negative. */
            o = sp_2048_cmp_72(r, m);
            sp_2048_cond_sub_72(r, r, m, ~(o >> 31));
        }

        /* Final Montgomery reduction and conditional subtract of m. */
        sp_2048_mont_reduce_72(r, m, mp);
        n = sp_2048_cmp_72(r, m);
        sp_2048_cond_sub_72(r, r, m, ~(n >> 31));
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* HAVE_FFDHE_2048 */

/* Diffie-Hellman modular exponentiation with a 2048-bit modulus.
 * The result is written big-endian with leading zero bytes removed.
 *
 * base     Base. MP integer of at most 2048 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 256.
 * mod      Modulus. MP integer of exactly 2048 bits that must be odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E when base, exponent or modulus has an
 * unsupported size, MP_VAL when the modulus is even and MEMORY_E if memory
 * allocation fails.
 */
int sp_DhExp_2048(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* buf = NULL;
#else
    sp_digit buf[72 * 4];
#endif
    sp_digit* expDigits = NULL;
    sp_digit* modDigits = NULL;
    sp_digit* result = NULL;
    word32 skip;
    int err = MP_OKAY;

    /* Size checks are evaluated in order and stop at the first failure. */
    if ((mp_count_bits(base) > 2048) || (expLen > 256U) ||
            (mp_count_bits(mod) != 2048)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        buf = (sp_digit*)XMALLOC(sizeof(sp_digit) * 72 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (buf == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: base/result at 0, exponent at 144, modulus at 216. */
        result    = buf;
        expDigits = buf + 72 * 2;
        modDigits = expDigits + 72;

        sp_2048_from_mp(buf, 72, base);
        sp_2048_from_bin(expDigits, 72, exp, expLen);
        sp_2048_from_mp(modDigits, 72, mod);

#ifdef HAVE_FFDHE_2048
        /* Base of 2 with a matching top modulus digit: use the 2^e path. */
        if (base->used == 1 && base->dp[0] == 2U &&
                (modDigits[70] >> 2) == 0xffffL) {
            err = sp_2048_mod_exp_2_72(result, expDigits, expLen * 8U,
                modDigits);
        }
        else
#endif
        {
            err = sp_2048_mod_exp_72(result, buf, expDigits, expLen * 8U,
                modDigits, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin_72(result, out);

        /* Count leading zero bytes and move the rest to the front. */
        skip = 0;
        while (skip < 256U && out[skip] == 0U) {
            skip++;
        }
        *outLen = 256U - skip;
        XMEMMOVE(out, out + skip, *outLen);
    }

    /* only the exponent is sensitive and needs zeroized; it is only
     * non-NULL once the working buffer is valid */
    if (expDigits != NULL) {
        ForceZero(expDigits, sizeof(sp_digit) * 72U);
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (buf != NULL) {
        XFREE(buf, NULL, DYNAMIC_TYPE_DH);
    }
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Perform a modular exponentiation with a 1024-bit modulus.
 * (res = base^exp mod mod)
 *
 * The same implementation is used whether or not WOLFSSL_SP_SMALL is
 * defined.
 *
 * base  Base. MP integer of at most 1024 bits.
 * exp   Exponent. MP integer of at most 1024 bits.
 * mod   Modulus. MP integer of exactly 1024 bits that must be odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E when base, exponent or modulus has an
 * unsupported bit length, MP_VAL when the modulus is even and MEMORY_E if
 * memory allocation fails. Errors from sp_2048_mod_exp_36() and
 * sp_2048_to_mp() are passed through.
 */
int sp_ModExp_1024(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[36 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    /* Counted once and reused for both validation and exponentiation. */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 1024) {
        err = MP_READ_E;
    }
    else if (expBits > 1024) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 1024) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 36 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: base/result at 0 (72 digits), exponent at 72,
         * modulus at 108. */
        e = b + 36 * 2;
        m = e + 36;
        r = b;

        sp_2048_from_mp(b, 36, base);
        sp_2048_from_mp(e, 36, exp);
        sp_2048_from_mp(m, 36, mod);

        err = sp_2048_mod_exp_36(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        /* Clear the upper 36 digits of r before converting to an mp_int. */
        XMEMSET(r + 36, 0, sizeof(*r) * 36U);
        err = sp_2048_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized; the 72 digits cleared
         * span the exponent and the modulus that follows it in the buffer */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 72U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* !WOLFSSL_SP_NO_2048 */

#ifndef WOLFSSL_SP_NO_3072
#ifdef WOLFSSL_SP_SMALL
/* Read big endian unsigned byte array into r.
 *
 * Bytes are taken from the end of the array first and packed into 29-bit
 * digits. Digits of r that receive no data are set to zero.
 *
 * r     A single precision integer.
 * size  Maximum number of digits to write into r.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int src = n - 1;
    int dst = 0;
    word32 shift = 0;

    r[0] = 0;
    while (src >= 0) {
        r[dst] |= (((sp_digit)a[src]) << shift);
        if (shift < 21U) {
            /* Whole byte fitted in the current digit. */
            shift += 8U;
        }
        else {
            /* Byte reaches the top of the digit: mask the digit and carry
             * the byte's remaining high bits into the next one. */
            r[dst] &= 0x1fffffff;
            shift = 29U - shift;
            if (dst + 1 >= size) {
                break;
            }
            r[++dst] = (sp_digit)a[src] >> shift;
            shift = 8U - shift;
        }
        src--;
    }

    /* Zero the digits above the last one written. */
    for (dst++; dst < size; dst++) {
        r[dst] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the mp_int digits (DIGIT_BIT bits each) into 29-bit digits.
 * Digits of r that receive no data are set to zero.
 *
 * r     A single precision integer.
 * size  Maximum number of digits to write into r.
 * a     A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 29
    /* Digits are the same width: copy without branching on a->used. */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* j is negative while i < a->used, so its top bit selects whether
         * the digit is copied. The shift is done on the unsigned value so
         * the result is exactly 0 or 1 - shifting by 28 never gave an
         * all-ones mask nor a step of one. */
        sp_digit mask = (sp_digit)0 - (sp_digit)((sp_uint32)j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Advance the read index only while more digits remain. */
        o += (int)((sp_uint32)j >> 31);
    }
#elif DIGIT_BIT > 29
    /* Each mp_int digit spans more than one 29-bit digit. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x1fffffff;
        s = 29U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        while ((s + 29U) <= (word32)DIGIT_BIT) {
            s += 29U;
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        s = (word32)DIGIT_BIT - s;
    }

    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* Several mp_int digits are packed into each 29-bit digit. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 29) {
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 29 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 384
 *
 * r is normalized in place (carries propagated, digits masked to 29 bits)
 * before being written out.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_3072_to_bin_106(sp_digit* r, byte* a)
{
    int i;
    int j;
    int s = 0;
    int b;

    /* Propagate carries so every digit but the top holds 29 bits. */
    for (i=0; i<105; i++) {
        r[i+1] += r[i] >> 29;
        r[i] &= 0x1fffffff;
    }
    /* Start at the last (least significant) byte: index 383. */
    j = 3079 / 8 - 1;
    a[j] = 0;
    for (i=0; i<106 && j>=0; i++) {
        /* b counts the bits of digit i written so far; s is the bit offset
         * in a[j] at which this digit starts. */
        b = 0;
        /* lint allow cast of mismatch sp_digit and int */
        a[j--] |= (byte)(r[i] << s); /*lint !e9033*/
        b += 8 - s;
        if (j < 0) {
            break;
        }
        /* Write the rest of the digit a byte at a time. */
        while (b < 29) {
            a[j--] = (byte)(r[i] >> b);
            b += 8;
            if (j < 0) {
                break;
            }
        }
        s = 8 - (b - 29);
        if (j >= 0) {
            a[j] = 0;
        }
        /* Step back to the byte just written so the next digit is OR'ed
         * into it at offset s. */
        if (s != 0) {
            j++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Normalize the values in each word to 29 bits.
 *
 * The part of each of the low 52 digits above 29 bits is added to the
 * next digit up. The top digit is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_3072_norm_53(sp_digit* a)
{
    sp_digit* digit = a;
    sp_digit* const top = a + 52;

    while (digit < top) {
        digit[1] += digit[0] >> 29;
        digit[0] &= 0x1fffffff;
        digit++;
    }
}

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
/* Normalize the values in each word to 29 bits.
 *
 * The part of each of the low 105 digits above 29 bits is added to the
 * next digit up. The top digit is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_3072_norm_106(sp_digit* a)
{
    sp_digit* digit = a;
    sp_digit* const top = a + 105;

    while (digit < top) {
        digit[1] += digit[0] >> 29;
        digit[0] &= 0x1fffffff;
        digit++;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * Product columns are computed from the most significant down. Each
 * column's partial products are summed in a 64-bit accumulator. Writes
 * 212 digits to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_3072_mul_106(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 lo;

    /* Top column has a single product. c keeps its low 29 bits pending. */
    c = ((sp_uint64)a[105]) * b[105];
    r[211] = (sp_digit)(c >> 29);
    c &= 0x1fffffff;
    for (k = 209; k >= 0; k--) {
        /* Range of a indices that contribute to column k. */
        if (k >= 106) {
            i = k - 105;
            imax = 105;
        }
        else {
            i = 0;
            imax = k;
        }
        if (imax - i > 15) {
            /* Long column: sum in groups of at most 15 products, moving
             * the bits above 29 into c after each group. */
            int imaxlo;
            lo = 0;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 15) {
                for (; i <= imax && i < imaxlo + 15; i++) {
                    lo += ((sp_uint64)a[i]) * b[k - i];
                }
                c += lo >> 29;
                lo &= 0x1fffffff;
            }
            /* c now holds column k + 1; its overflow goes to k + 2. */
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
        else {
            /* Short column: sum all products in one pass. */
            lo = 0;
            for (; i <= imax; i++) {
                lo += ((sp_uint64)a[i]) * b[k - i];
            }
            c += lo >> 29;
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
    }
    /* Low 29 bits of column 0. */
    r[0] = (sp_digit)c;
}

/* Square a and put result in r. (r = a * a)
 *
 * Product columns are computed from the most significant down. Each
 * cross product a[i] * a[k - i] with i != k - i is computed once and
 * doubled. Writes 212 digits to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_3072_sqr_106(sp_digit* r, const sp_digit* a)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 t;

    /* Top column. c keeps the pending low 29 bits shifted up by 29 so the
     * next column's carry can be added below them. */
    c = ((sp_uint64)a[105]) * a[105];
    r[211] = (sp_digit)(c >> 29);
    c = (c & 0x1fffffff) << 29;
    for (k = 209; k >= 0; k--) {
        i = (k + 1) / 2;
        /* Even columns include the square of the middle digit. */
        if ((k & 1) == 0) {
           c += ((sp_uint64)a[i]) * a[i];
           i++;
        }
        if (k < 105) {
            imax = k;
        }
        else {
            imax = 105;
        }
        if (imax - i >= 14) {
            /* Long column: sum in groups of at most 14 products, keeping
             * the bits above 29 in hi. */
            int imaxlo;
            sp_uint64 hi;

            hi = c >> 29;
            c &= 0x1fffffff;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 14) {
                t = 0;
                for (; i <= imax && i < imaxlo + 14; i++) {
                    t += ((sp_uint64)a[i]) * a[k - i];
                }
                c += t * 2;

                hi += c >> 29;
                c &= 0x1fffffff;
            }
            r[k + 2] += (sp_digit)(hi >> 29);
            r[k + 1]  = (sp_digit)(hi & 0x1fffffff);
            c <<= 29;
        }
        else
        {
            /* Short column: sum all cross products in one pass. */
            t = 0;
            for (; i <= imax; i++) {
                t += ((sp_uint64)a[i]) * a[k - i];
            }
            c += t * 2;

            r[k + 2] += (sp_digit) (c >> 58);
            r[k + 1]  = (sp_digit)((c >> 29) & 0x1fffffff);
            c = (c & 0x1fffffff) << 29;
        }
    }
    /* Pending low 29 bits of column 0. */
    r[0] = (sp_digit)(c >> 29);
}

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * An inverse of a[0] that is correct to 4 bits is refined three times,
 * doubling the number of correct bits each time, and then reduced to
 * 29 bits and negated.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
{
    const sp_digit low = a[0];
    sp_digit inv;
    int step;

    /* here inv*a==1 mod 2**4 */
    inv = (((low + 2) & 4) << 1) + low;
    /* each step doubles the valid bits: 2**8, 2**16 then 2**32 */
    for (step = 0; step < 3; step++) {
        inv *= 2 - low * inv;
    }
    inv &= 0x1fffffff;

    /* rho = -1/m mod b */
    *rho = ((sp_digit)1 << 29) - inv;
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * Writes 107 digits to r: 106 masked to 29 bits plus the final carry.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_d_106(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 106) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Remaining carry becomes the extra top digit. */
    r[idx] = (sp_digit)acc;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently; no borrow is propagated.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_3072_sub_53(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    const sp_digit* const end = a + 53;

    while (a != end) {
        *r++ = *a++ - *b++;
    }

    return 0;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 * Here n is 1536 (52 digits of 29 bits plus a 28-bit top digit).
 * Given m must be that many bits, just need to subtract.
 *
 * r  A single precision number.
 * m  A single precision number.
 */
static void sp_3072_mont_norm_53(sp_digit* r, const sp_digit* m)
{
    int idx = 0;

    /* Set r = 2^n - 1. */
    while (idx < 52) {
        r[idx++] = 0x1fffffff;
    }
    r[idx] = 0xfffffffL;

    /* r = (2^n - 1) mod n */
    (void)sp_3072_sub_53(r, r, m);

    /* Add one so r = 2^n mod m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are compared from the top down without branching on their
 * values. Every digit is always visited.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_53(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int idx = 53;

    while (idx > 0) {
        sp_digit keep;

        idx--;
        /* Mask derived from the result so far; all ones while it is 0. */
        keep = ~(((sp_digit)0 - diff) >> 28);
        diff |= (a[idx] - b[idx]) & keep;
    }

    return diff;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Digits are handled independently; no borrow is propagated.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_53(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int left;

    for (left = 53; left > 0; left--) {
        const sp_digit masked = *b++ & m;

        *r++ = *a++ - masked;
    }
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * The low 53 digits of r are left masked to 29 bits and the final carry
 * is added to r[53].
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_add_53(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    /* Compact version: one digit per iteration. */
    sp_int64 tb = b;
    sp_int64 t = 0;
    int i;

    for (i = 0; i < 53; i++) {
        t += r[i];
        t += tb * a[i];
        r[i] = ((sp_digit)t) & 0x1fffffff;
        t >>= 29;
    }
    r[53] += (sp_digit)t;
#else
    /* Unrolled version: four digits per iteration, then the last digit. */
    sp_int64 tb = b;
    sp_int64 t[4];
    int i;

    t[0] = 0;
    for (i = 0; i < 52; i += 4) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        /* Carry into the first digit of the next group. */
        t[0]  = t[3] >> 29;
    }
    t[0] += (tb * a[52]) + r[52];
    r[52] = t[0] & 0x1fffffff;
    r[53] +=  (sp_digit)(t[0] >> 29);
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Shift the result in the high 1536 bits down to the bottom.
 *
 * The value starting at bit 28 of a[52] is moved down to bit 0 of r and
 * the upper 53 digits of r are zeroed. r may be the same array as a.
 *
 * r  A single precision number.
 * a  A single precision number.
 */
static void sp_3072_mont_shift_53(sp_digit* r, const sp_digit* a)
{
    const sp_digit* src = a + 53;
    sp_int64 acc;
    int out;

    /* Top bit of a[52] is bit 0 of the result; digits above follow it. */
    acc  = a[52] >> 28;
    acc += ((sp_int64)*src++) << 1;

    for (out = 0; out < 52; out++) {
        r[out] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        acc += ((sp_int64)*src++) << 1;
    }
    r[52] = (sp_digit)acc;
    XMEMSET(r + 53, 0, sizeof(*r) * 53U);
}

/* Reduce the number back to 1536 bits (53 digits) using Montgomery
 * reduction.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_53(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    sp_3072_norm_53(a + 53);

    /* For each low digit add the multiple of m that zeroes its low 29
     * bits, then pass what is left up as a carry. */
    for (i=0; i<52; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
        sp_3072_mul_add_53(a+i, m, mu);
        a[i+1] += a[i] >> 29;
    }
    /* Top digit holds only 28 bits so a 28-bit mask is used. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffffL;
    sp_3072_mul_add_53(a+i, m, mu);
    a[i+1] += a[i] >> 29;
    a[i] &= 0x1fffffff;
    /* Move the upper half down to become the result. */
    sp_3072_mont_shift_53(a, a);
    /* Subtract m only when the top digit of a is greater than that of m. */
    over = a[52] - m[52];
    sp_3072_cond_sub_53(a, a, m, ~((over - 1) >> 31));
    sp_3072_norm_53(a);
}

/* Multiply a and b into r. (r = a * b)
 *
 * Product columns are computed from the most significant down. Each
 * column's partial products are summed in a 64-bit accumulator. Writes
 * 106 digits to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_3072_mul_53(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 lo;

    /* Top column has a single product. c keeps its low 29 bits pending. */
    c = ((sp_uint64)a[52]) * b[52];
    r[105] = (sp_digit)(c >> 29);
    c &= 0x1fffffff;
    for (k = 103; k >= 0; k--) {
        /* Range of a indices that contribute to column k. */
        if (k >= 53) {
            i = k - 52;
            imax = 52;
        }
        else {
            i = 0;
            imax = k;
        }
        if (imax - i > 15) {
            /* Long column: sum in groups of at most 15 products, moving
             * the bits above 29 into c after each group. */
            int imaxlo;
            lo = 0;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 15) {
                for (; i <= imax && i < imaxlo + 15; i++) {
                    lo += ((sp_uint64)a[i]) * b[k - i];
                }
                c += lo >> 29;
                lo &= 0x1fffffff;
            }
            /* c now holds column k + 1; its overflow goes to k + 2. */
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
        else {
            /* Short column: sum all products in one pass. */
            lo = 0;
            for (; i <= imax; i++) {
                lo += ((sp_uint64)a[i]) * b[k - i];
            }
            c += lo >> 29;
            r[k + 2] += (sp_digit)(c >> 29);
            r[k + 1]  = (sp_digit)(c & 0x1fffffff);
            c = lo & 0x1fffffff;
        }
    }
    /* Low 29 bits of column 0. */
    r[0] = (sp_digit)c;
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r and then Montgomery reduced in place.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_mul_53(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_3072_mul_53(r, a, b);
    sp_3072_mont_reduce_53(r, m, mp);
}

/* Square a and put result in r. (r = a * a)
 *
 * Product columns are computed from the most significant down. Each
 * cross product a[i] * a[k - i] with i != k - i is computed once and
 * doubled. Writes 106 digits to r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_3072_sqr_53(sp_digit* r, const sp_digit* a)
{
    int i;
    int imax;
    int k;
    sp_uint64 c;
    sp_uint64 t;

    /* Top column. c keeps the pending low 29 bits shifted up by 29 so the
     * next column's carry can be added below them. */
    c = ((sp_uint64)a[52]) * a[52];
    r[105] = (sp_digit)(c >> 29);
    c = (c & 0x1fffffff) << 29;
    for (k = 103; k >= 0; k--) {
        i = (k + 1) / 2;
        /* Even columns include the square of the middle digit. */
        if ((k & 1) == 0) {
           c += ((sp_uint64)a[i]) * a[i];
           i++;
        }
        if (k < 52) {
            imax = k;
        }
        else {
            imax = 52;
        }
        if (imax - i >= 14) {
            /* Long column: sum in groups of at most 14 products, keeping
             * the bits above 29 in hi. */
            int imaxlo;
            sp_uint64 hi;

            hi = c >> 29;
            c &= 0x1fffffff;
            for (imaxlo = i; imaxlo <= imax; imaxlo += 14) {
                t = 0;
                for (; i <= imax && i < imaxlo + 14; i++) {
                    t += ((sp_uint64)a[i]) * a[k - i];
                }
                c += t * 2;

                hi += c >> 29;
                c &= 0x1fffffff;
            }
            r[k + 2] += (sp_digit)(hi >> 29);
            r[k + 1]  = (sp_digit)(hi & 0x1fffffff);
            c <<= 29;
        }
        else
        {
            /* Short column: sum all cross products in one pass. */
            t = 0;
            for (; i <= imax; i++) {
                t += ((sp_uint64)a[i]) * a[k - i];
            }
            c += t * 2;

            r[k + 2] += (sp_digit) (c >> 58);
            r[k + 1]  = (sp_digit)((c >> 29) & 0x1fffffff);
            c = (c & 0x1fffffff) << 29;
        }
    }
    /* Pending low 29 bits of column 0. */
    r[0] = (sp_digit)(c >> 29);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r and then Montgomery reduced in place.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_sqr_53(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_3072_sqr_53(r, a);
    sp_3072_mont_reduce_53(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * Writes 54 digits to r: 53 masked to 29 bits plus the final carry.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_d_53(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 53) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Remaining carry becomes the extra top digit. */
    r[idx] = (sp_digit)acc;
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Digits are handled independently; no carry is propagated.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_53(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int left;

    for (left = 53; left > 0; left--) {
        const sp_digit masked = *b++ & m;

        *r++ = *a++ + masked;
    }
}
#endif /* WOLFSSL_SP_SMALL */

/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently; no carry is propagated.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_3072_add_53(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    const sp_digit* const end = a + 53;

    while (a != end) {
        *r++ = *a++ + *b++;
    }

    return 0;
}

/* Shift a right by n bits into r. (r = a >> n)
 *
 * Each of the low 52 result digits takes the bits of a[i] above n and
 * the low bits of a[i + 1], masked to 29 bits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_3072_rshift_53(sp_digit* r, const sp_digit* a,
        byte n)
{
    int idx = 0;

    while (idx < 52) {
        const sp_digit low  = a[idx] >> n;
        const sp_digit high = a[idx + 1] << (29 - n);

        r[idx] = (low | high) & 0x1fffffff;
        idx++;
    }
    /* Top digit has nothing above it to shift in. */
    r[idx] = a[idx] >> n;
}

/* Divide the two digit value (d1 * 2^29 + d0) by div.
 *
 * The implementation is selected at compile time: a 64-bit divide, the
 * x86 idiv instruction, a loop with no data-dependent branches, or an
 * estimate built from 32-bit divides.
 *
 * d1   High digit of the dividend.
 * d0   Low digit of the dividend.
 * div  Divisor.
 * returns the quotient computed by the selected implementation.
 */
static WC_INLINE sp_digit sp_3072_div_word_53(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    /* Let the compiler perform the 64-bit division. */
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    /* Split the 64-bit dividend into the halves passed to idiv. */
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo)
        : "d" (hi), "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    /* No divide instruction used: build the quotient a bit at a time
     * against dv = div / 2 + 1, then correct the estimate. */
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Each step sets a quotient bit when dv - t1 is negative, using the
     * sign bit as the flag instead of a branch. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final adjustments by +/-1 based on the sign and size of m. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    /* Estimate the quotient in three steps using 32-bit divides,
     * subtracting each partial quotient times div from d. */
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 14) + 1;

    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit d by div.
 *
 * Where a divide is not used the result is 1 when div - d is negative
 * and 0 otherwise.
 *
 * d    Dividend.
 * div  Divisor.
 * returns the quotient computed by the selected implementation.
 */
static WC_INLINE sp_digit sp_3072_word_div_word_53(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    /* Sign bit of (div - d) is the result. */
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are doubled before dividing and the remainder is halved
 * at the end. 106 digits are copied into r, so r must have room for them.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result. Unused.
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_53(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 53 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 53 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Buffer layout: t1 has 107 digits (doubled dividend), t2 has 54
         * (quotient digit times divisor), sd has 54 (doubled divisor). */
        t2 = t1 + 106 + 1;
        sd = t2 + 53 + 1;

        /* Double both operands. */
        sp_3072_mul_d_53(sd, d, (sp_digit)1 << 1);
        sp_3072_mul_d_106(t1, a, (sp_digit)1 << 1);
        dv = sd[52];
        t1[53 + 53] += t1[53 + 53 - 1] >> 29;
        t1[53 + 53 - 1] &= 0x1fffffff;
        for (i=53; i>=0; i--) {
            /* Estimate a quotient digit from the top two digits and
             * subtract that multiple of the divisor. */
            r1 = sp_3072_div_word_53(t1[53 + i], t1[53 + i - 1], dv);

            sp_3072_mul_d_53(t2, sd, r1);
            (void)sp_3072_sub_53(&t1[i], &t1[i], t2);
            sp_3072_norm_53(&t1[i]);
            t1[53 + i] -= t2[53];
            t1[53 + i] += t1[53 + i - 1] >> 29;
            t1[53 + i - 1] &= 0x1fffffff;
            /* Second estimate from the negated top digits; the multiple
             * it gives is added back. */
            r1 = sp_3072_div_word_53(-t1[53 + i], -t1[53 + i - 1], dv);
            r1 -= t1[53 + i];
            sp_3072_mul_d_53(t2, sd, r1);
            (void)sp_3072_add_53(&t1[i], &t1[i], t2);
            t1[53 + i] += t1[53 + i - 1] >> 29;
            t1[53 + i - 1] &= 0x1fffffff;
        }
        /* One more single digit estimate on what remains. */
        t1[53 - 1] += t1[53 - 2] >> 29;
        t1[53 - 2] &= 0x1fffffff;
        r1 = sp_3072_word_div_word_53(t1[53 - 1], dv);

        sp_3072_mul_d_53(t2, sd, r1);
        sp_3072_sub_53(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 106U);
        for (i=0; i<52; i++) {
            r[i+1] += r[i] >> 29;
            r[i] &= 0x1fffffff;
        }
        /* Add the doubled divisor back when the remainder went negative. */
        sp_3072_cond_add_53(r, r, sd, r[52] >> 31);

        sp_3072_norm_53(r);
        /* Halve the remainder to undo the doubling. */
        sp_3072_rshift_53(r, r, 1);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * r  Receives the reduced value.
 * a  Value to be reduced.
 * m  Modulus to reduce by.
 * returns the status of the underlying division: MEMORY_E when a temporary
 * buffer could not be allocated and MP_OKAY otherwise.
 */
static int sp_3072_mod_53(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is supplied. */
    int ret = sp_3072_div_53(a, m, NULL, r);

    return ret;
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is selected at compile time:
 *  - WOLFSSL_SP_SMALL defined and WOLFSSL_SP_FAST_MODEXP not defined:
 *    one exponent bit per iteration using three temporaries that are
 *    zeroed before use.
 *  - otherwise, WC_NO_CACHE_RESISTANT not defined: the same bit-by-bit
 *    loop without zeroing the temporaries.
 *  - otherwise: 5-bit fixed window over a table of 32 precomputed powers.
 *
 * r        A single precision number that is the result of the operation.
 *          Every variant copies 106 digits (53 * 2) into r.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is used.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_53(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 106];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 53 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]; each t[i] is 106 digits long. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 53 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 53U * 2U);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_53(norm, m);

        if (reduceA != 0) {
            err = sp_3072_mod_53(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 53U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m */
        sp_3072_mul_53(t[1], t[1], norm);
        err = sp_3072_mod_53(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top exponent bit; c counts the bits
         * still to be consumed from n, which is kept aligned so that the next
         * bit is always bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Current digit used up: stop after digit 0, else load the
                 * next lower digit. */
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_53(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit, via t[2]. The source
             * and destination address is formed by masking both pointers
             * rather than indexing t[] with y.
             * NOTE(review): addr_mask is defined elsewhere; presumably
             * addr_mask[0] is 0 and addr_mask[1] is all ones so the sum
             * equals t[y] - confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 53 * 2);
            sp_3072_mont_sqr_53(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 53 * 2);
        }

        /* Final Montgomery reduction of t[0], then subtract m when the
         * comparison result is not negative (t[0] >= m). */
        sp_3072_mont_reduce_53(t[0], m, mp);
        n = sp_3072_cmp_53(t[0], m);
        sp_3072_cond_sub_53(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 53 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 106];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 53 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]; each t[i] is 106 digits long. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 53 * 2);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_53(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA != 0) {
            err = sp_3072_mod_53(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_53(t[1], t[1], norm);
                err = sp_3072_mod_53(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_53(t[1], a, norm);
            err = sp_3072_mod_53(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top exponent bit; c counts the bits
         * still to be consumed from n, which is kept aligned so that the next
         * bit is always bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Current digit used up: stop after digit 0, else load the
                 * next lower digit. */
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_53(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit, via t[2]. The address
             * is formed by masking both pointers rather than indexing t[].
             * NOTE(review): addr_mask is defined elsewhere; presumably
             * addr_mask[0] is 0 and addr_mask[1] is all ones - confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 53 * 2);
            sp_3072_mont_sqr_53(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 53 * 2);
        }

        /* Final Montgomery reduction of t[0], then subtract m when the
         * comparison result is not negative (t[0] >= m). */
        sp_3072_mont_reduce_53(t[0], m, mp);
        n = sp_3072_cmp_53(t[0], m);
        sp_3072_cond_sub_53(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 53 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(32 * 106) + 106];
#endif
    sp_digit* t[32];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 106) + 106), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* 32 table entries of 106 digits each; norm shares storage with t[0]
         * and rt is the 106 digit accumulator after the table
         * (32 * 106 = 3392). */
        norm = td;
        for (i=0; i<32; i++)
            t[i] = td + i * 106;
        rt = td + 3392;

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_53(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA != 0) {
            err = sp_3072_mod_53(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_53(t[1], t[1], norm);
                err = sp_3072_mod_53(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_53(t[1], a, norm);
            err = sp_3072_mod_53(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] is the k-th power of t[1]. Even entries are
         * squares of t[k/2]; odd entries are products of two adjacent
         * lower entries. */
        sp_3072_mont_sqr_53(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_53(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_53(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_53(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_53(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_53(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_53(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_53(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_53(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_53(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_53(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_53(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_53(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_53(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_53(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_53(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_53(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_53(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_53(t[20], t[10], m, mp);
        sp_3072_mont_mul_53(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_53(t[22], t[11], m, mp);
        sp_3072_mont_mul_53(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_53(t[24], t[12], m, mp);
        sp_3072_mont_mul_53(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_53(t[26], t[13], m, mp);
        sp_3072_mont_mul_53(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_53(t[28], t[14], m, mp);
        sp_3072_mont_mul_53(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_53(t[30], t[15], m, mp);
        sp_3072_mont_mul_53(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows, then
         * find the top digit index and how many of its bits are in use. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* n holds upcoming exponent bits left-aligned in a 32-bit word.
         * Rounding up may point one digit past the 53 digit exponent, in
         * which case the top bits are taken as zero. */
        if (i < 53) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* NOTE(review): this guard admits c == 4, for which (3 - c) is a
         * negative shift count - confirm c == 4 cannot occur for the bit
         * lengths callers pass. */
        if (c < 5) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* First window selects the starting value directly from the table. */
        y = (int)((n >> 27) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 106);
        while ((i >= 0) || (c >= 5)) {
            if (c >= 5) {
                /* A whole window is available in n. */
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* n is empty: load the next digit left-aligned. */
                n = e[i--] << 3;
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c = 24;
            }
            else {
                /* Window straddles two digits: combine the bits left in n
                 * with the top bits of the next digit. */
                y = (byte)((n >> 27) & 0x1f);
                n = e[i--] << 3;
                c = 5 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* Five squarings followed by one table multiply per window. */
            sp_3072_mont_sqr_53(rt, rt, m, mp);
            sp_3072_mont_sqr_53(rt, rt, m, mp);
            sp_3072_mont_sqr_53(rt, rt, m, mp);
            sp_3072_mont_sqr_53(rt, rt, m, mp);
            sp_3072_mont_sqr_53(rt, rt, m, mp);

            sp_3072_mont_mul_53(rt, rt, t[y], m, mp);
        }

        /* Final Montgomery reduction of rt, then subtract m when the
         * comparison result is not negative (rt >= m). */
        sp_3072_mont_reduce_53(rt, m, mp);
        n = sp_3072_cmp_53(rt, m);
        sp_3072_cond_sub_53(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 106);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* Digit-wise subtraction over 106 digits. (r = a - b)
 *
 * No borrow is propagated between digits; each output digit is simply the
 * difference of the corresponding input digits.
 *
 * r  Destination for the differences.
 * a  Number to subtract from.
 * b  Number to subtract.
 * returns 0 always.
 */
SP_NOINLINE static int sp_3072_sub_106(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 106) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

/* Calculate r = 2^n mod m, where n is the number of bits to reduce by.
 * As m must be 3072 bits long, a single subtraction is sufficient.
 *
 * r  Receives the result.
 * m  The modulus.
 */
static void sp_3072_mont_norm_106(sp_digit* r, const sp_digit* m)
{
    int idx;

    /* Build 2^n - 1: the top digit has 27 bits set ... */
    r[105] = 0x7ffffffL;
    /* ... and each of the 105 lower digits has all 29 bits set. */
    for (idx = 104; idx >= 0; idx--) {
        r[idx] = 0x1fffffff;
    }

    /* r = (2^n - 1) - m */
    (void)sp_3072_sub_106(r, r, m);

    /* One more gives r = 2^n - m. */
    r[0] += 1;
}

/* Compare two 106 digit numbers without data-dependent branching.
 *
 * Digits are visited from most significant to least significant and the
 * running result is used to mask the contribution of later digits.
 *
 * a  First number.
 * b  Second number.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_106(const sp_digit* a, const sp_digit* b)
{
    sp_digit res = 0;
    sp_digit diff;
    sp_digit keep;
    int idx = 106;

    while (idx-- > 0) {
        diff = a[idx] - b[idx];
        keep = ~(((sp_digit)0 - res) >> 28);
        res |= diff & keep;
    }

    return res;
}

/* Subtract b from a when the mask selects it. (r = a - (b & m))
 * The mask m is -1 to perform the subtraction and 0 to leave a unchanged.
 *
 * r  Receives the result of the conditional subtraction.
 * a  Number to subtract from.
 * b  Number to subtract.
 * m  Mask applied to every digit of b.
 */
static void sp_3072_cond_sub_106(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 106) {
        r[idx] = a[idx] - (b[idx] & m);
        idx++;
    }
}

/* Multiply a by the scalar b and accumulate into r. (r += a * b)
 *
 * Digits r[0..105] are left holding 29 bits each; the final carry is added
 * into r[106], so r must have at least 107 digits.
 *
 * r  Accumulator that is updated in place.
 * a  Number to multiply.
 * b  Scalar multiplier.
 */
SP_NOINLINE static void sp_3072_mul_add_106(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 106) {
        acc += r[idx];
        acc += scalar * a[idx];
        r[idx] = ((sp_digit)acc) & 0x1fffffff;
        acc >>= 29;
        idx++;
    }
    r[106] += (sp_digit)acc;
#else
    sp_int64 scalar = b;
    sp_int64 w0 = 0;
    sp_int64 w1;
    sp_int64 w2;
    sp_int64 w3;
    int idx;

    /* Four digits per iteration; w0 carries into the next group. */
    for (idx = 0; idx < 104; idx += 4) {
        w0 += (scalar * a[idx+0]) + r[idx+0];
        w1  = (scalar * a[idx+1]) + r[idx+1];
        w2  = (scalar * a[idx+2]) + r[idx+2];
        w3  = (scalar * a[idx+3]) + r[idx+3];
        r[idx+0] = w0 & 0x1fffffff;
        w1 += w0 >> 29;
        r[idx+1] = w1 & 0x1fffffff;
        w2 += w1 >> 29;
        r[idx+2] = w2 & 0x1fffffff;
        w3 += w2 >> 29;
        r[idx+3] = w3 & 0x1fffffff;
        w0  = w3 >> 29;
    }
    /* Remaining two digits (104 and 105). */
    w0 += (scalar * a[104]) + r[104];
    w1  = (scalar * a[105]) + r[105];
    r[104] = w0 & 0x1fffffff;
    w1 += w0 >> 29;
    r[105] = w1 & 0x1fffffff;
    r[106] +=  (sp_digit)(w1 >> 29);
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the upper 3072 bits of a down into the low 106 digits of r.
 *
 * The value starts at bit 27 of a[105]; the digits above it are each shifted
 * left by 2 and merged in. Digits r[106..211] are cleared afterwards.
 * Input is consumed ahead of where output is written, so r may be the same
 * buffer as a.
 *
 * r  Receives the shifted number (212 digits are written).
 * a  Number to shift (212 digits are read).
 */
static void sp_3072_mont_shift_106(sp_digit* r, const sp_digit* a)
{
    const sp_digit* upper = a + 107;
    sp_int64 acc;
    int idx = 0;

    acc = a[105] >> 27;
    acc += ((sp_int64)a[106]) << 2;

    while (idx < 105) {
        r[idx] = acc & 0x1fffffff;
        acc >>= 29;
        acc += ((sp_int64)upper[idx]) << 2;
        idx++;
    }
    r[105] = (sp_digit)acc;
    XMEMSET(&r[106], 0, sizeof(*r) * 106U);
}

/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * For each of the 106 low digits a multiple of m is added so that the digit's
 * low bits cancel; the carry is pushed into the next digit. The upper half is
 * then shifted down and m is conditionally subtracted once.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_106(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper half before anything is accumulated into it. */
    sp_3072_norm_106(a + 106);

#ifdef WOLFSSL_SP_DH
    if (mp != 1) {
        for (i=0; i<105; i++) {
            /* mu = a[i] * mp mod 2^29 */
            mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
            sp_3072_mul_add_106(a+i, m, mu);
            a[i+1] += a[i] >> 29;
        }
        /* Top digit (i == 105) uses a 27-bit mask. */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x7ffffffL;
        sp_3072_mul_add_106(a+i, m, mu);
        a[i+1] += a[i] >> 29;
        a[i] &= 0x1fffffff;
    }
    else {
        /* mp == 1: the multiplication by mp is skipped. */
        for (i=0; i<105; i++) {
            mu = a[i] & 0x1fffffff;
            sp_3072_mul_add_106(a+i, m, mu);
            a[i+1] += a[i] >> 29;
        }
        mu = a[i] & 0x7ffffffL;
        sp_3072_mul_add_106(a+i, m, mu);
        a[i+1] += a[i] >> 29;
        a[i] &= 0x1fffffff;
    }
#else
    for (i=0; i<105; i++) {
        /* mu = a[i] * mp mod 2^29 */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
        sp_3072_mul_add_106(a+i, m, mu);
        a[i+1] += a[i] >> 29;
    }
    /* Top digit (i == 105) uses a 27-bit mask. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x7ffffffL;
    sp_3072_mul_add_106(a+i, m, mu);
    a[i+1] += a[i] >> 29;
    a[i] &= 0x1fffffff;
#endif
    /* Bring the upper 3072 bits down to the bottom of a. */
    sp_3072_mont_shift_106(a, a);
    /* Subtract m when the top digit of a is greater than the top digit of m:
     * (over - 1) is then non-negative, so the mask is all ones. */
    over = a[105] - m[105];
    sp_3072_cond_sub_106(a, a, m, ~((over - 1) >> 31));
    sp_3072_norm_106(a);
}

/* Montgomery multiplication of two numbers. (r = a * b mod m)
 *
 * r   Receives the reduced product; also used to hold the full product.
 * a   First operand in Montgomery form.
 * b   Second operand in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_mul_106(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Full product into r ... */
    sp_3072_mul_106(r, a, b);

    /* ... then reduce it in place. */
    sp_3072_mont_reduce_106(r, m, mp);
}

/* Montgomery squaring of a number. (r = a * a mod m)
 *
 * r   Receives the reduced square; also used to hold the full square.
 * a   Operand in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_sqr_106(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Full square into r ... */
    sp_3072_sqr_106(r, a);

    /* ... then reduce it in place. */
    sp_3072_mont_reduce_106(r, m, mp);
}

/* Multiply a 212 digit number by a scalar. (r = a * b)
 *
 * The 212 low result digits hold 29 bits each and the final carry is stored
 * in r[212], so r must have at least 213 digits.
 *
 * r  Receives the product.
 * a  Number to multiply.
 * b  Scalar multiplier.
 */
SP_NOINLINE static void sp_3072_mul_d_212(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 212) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    r[212] = (sp_digit)acc;
}

#ifdef WOLFSSL_SP_SMALL
/* Add b to a when the mask selects it. (r = a + (b & m))
 * The mask m is -1 to perform the addition and 0 to leave a unchanged.
 *
 * r  Receives the result of the conditional addition.
 * a  Number to add to.
 * b  Number to add.
 * m  Mask applied to every digit of b.
 */
static void sp_3072_cond_add_106(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 106) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

/* Digit-wise addition over 106 digits. (r = a + b)
 *
 * No carry is propagated between digits; each output digit is simply the
 * sum of the corresponding input digits.
 *
 * r  Destination for the sums.
 * a  First number to add.
 * b  Second number to add.
 * returns 0 always.
 */
SP_NOINLINE static int sp_3072_add_106(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 106) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}

/* Shift a 106 digit number right by n bits. (r = a >> n)
 *
 * Each output digit combines the upper bits of one input digit with the low
 * n bits of the next higher digit, masked to 29 bits. Digits are processed
 * from least significant upwards.
 *
 * r  Receives the shifted number.
 * a  Number to shift.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_3072_rshift_106(sp_digit* r, const sp_digit* a,
        byte n)
{
    int idx = 0;

    while (idx < 105) {
        r[idx] = ((a[idx] >> n) | (a[idx + 1] << (29 - n))) & 0x1fffffff;
        idx++;
    }
    /* Nothing above the top digit to bring in. */
    r[105] = a[105] >> n;
}

/* Divide the two digit value (d1 * 2^29 + d0) by div and return the quotient.
 *
 * The implementation is chosen at compile time:
 *  - SP_USE_DIVTI3: plain 64-bit C division.
 *  - x86 / x86_64: the idiv instruction on the 64-bit dividend.
 *  - not aarch64 and SP_DIV_WORD_USE_DIV not defined: shift-and-subtract
 *    estimate followed by correction steps; no division operator is used.
 *  - otherwise: three 32-bit divisions on successively reduced remainders.
 *
 * d1   Upper digit of the dividend.
 * d0   Lower digit of the dividend.
 * div  Divisor.
 * returns the computed quotient.
 */
static WC_INLINE sp_digit sp_3072_div_word_106(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* Dividend is hi:lo in edx:eax; the quotient comes back in eax (lo).
     * NOTE(review): idiv also writes the remainder to edx, which is listed
     * here as an input-only operand - confirm whether it should be declared
     * as modified. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo)
        : "d" (hi), "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    /* Work with half the divisor, rounded up by one. */
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* r/t2 are 1 when t1 > dv (top bit of dv - t1 is set); dv is then
     * subtracted from t1 using a mask rather than a branch. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        /* Shift in the next bit of t0 and produce one more quotient bit. */
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final adjustments of at most one each: sign is -1 when bit 31 of m
     * is set and +1 otherwise; m is replaced by m * sign and r moves by sign
     * when that value exceeds div. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    /* Top bits of the divisor, plus one. */
    sp_digit dv = (div >> 14) + 1;

    /* First partial quotient from the top bits of d. */
    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    /* Second partial quotient from what is left. */
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    /* Last step divides the remaining value by div itself. */
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit by another single digit.
 *
 * Where a division operator is used the true quotient is returned. In the
 * fallback only 0 or 1 is produced: 1 when d is greater than div.
 *
 * d    Dividend.
 * div  Divisor.
 * returns the quotient as described above.
 */
static WC_INLINE sp_digit sp_3072_word_div_word_106(sp_digit d, sp_digit div)
{
    sp_digit quot;

#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    quot = d / div;
#else
    /* Top bit of (div - d) is set exactly when d exceeds div. */
    quot = (sp_digit)((sp_uint32)(div - d) >> 31);
#endif

    return quot;
}
/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are first multiplied by 4 and the remainder is shifted right
 * by 2 at the end. The quotient is estimated one digit at a time from the top
 * two digits of the running remainder and the top digit of the scaled divisor.
 *
 * a  Number to be divided (212 digits are read).
 * d  Number to divide with (106 digits are read).
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 212 digits are written, so the buffer
 *    must be at least that long.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_106(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 106 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 106 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Layout of the single buffer:
         *   t1: 213 digits - scaled dividend / running remainder
         *   t2: 107 digits - quotient digit times scaled divisor
         *   sd: 107 digits - scaled divisor */
        t2 = t1 + 212 + 1;
        sd = t2 + 106 + 1;

        /* Scale divisor and dividend by 4. */
        sp_3072_mul_d_106(sd, d, (sp_digit)1 << 2);
        sp_3072_mul_d_212(t1, a, (sp_digit)1 << 2);
        dv = sd[105];
        /* Fold any excess above 29 bits of digit 211 into digit 212. */
        t1[106 + 106] += t1[106 + 106 - 1] >> 29;
        t1[106 + 106 - 1] &= 0x1fffffff;
        for (i=106; i>=0; i--) {
            /* Estimate the quotient digit from the top two digits. */
            r1 = sp_3072_div_word_106(t1[106 + i], t1[106 + i - 1], dv);

            /* Remove r1 * sd from the window starting at t1[i]. */
            sp_3072_mul_d_106(t2, sd, r1);
            (void)sp_3072_sub_106(&t1[i], &t1[i], t2);
            sp_3072_norm_106(&t1[i]);
            t1[106 + i] -= t2[106];
            t1[106 + i] += t1[106 + i - 1] >> 29;
            t1[106 + i - 1] &= 0x1fffffff;
            /* Second estimate on the negated top digits; the resulting
             * multiple of sd is added back to correct the first estimate. */
            r1 = sp_3072_div_word_106(-t1[106 + i], -t1[106 + i - 1], dv);
            r1 -= t1[106 + i];
            sp_3072_mul_d_106(t2, sd, r1);
            (void)sp_3072_add_106(&t1[i], &t1[i], t2);
            t1[106 + i] += t1[106 + i - 1] >> 29;
            t1[106 + i - 1] &= 0x1fffffff;
        }
        /* One last single-digit quotient step on the low 106 digits. */
        t1[106 - 1] += t1[106 - 2] >> 29;
        t1[106 - 2] &= 0x1fffffff;
        r1 = sp_3072_word_div_word_106(t1[106 - 1], dv);

        sp_3072_mul_d_106(t2, sd, r1);
        sp_3072_sub_106(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 212U);
        /* Propagate carries so each of the low digits holds 29 bits. */
        for (i=0; i<105; i++) {
            r[i+1] += r[i] >> 29;
            r[i] &= 0x1fffffff;
        }
        /* Add the scaled divisor back when the top digit is negative
         * (the arithmetic shift of the sign bit gives an all ones mask). */
        sp_3072_cond_add_106(r, r, sd, r[105] >> 31);

        sp_3072_norm_106(r);
        /* Undo the scaling by 4. */
        sp_3072_rshift_106(r, r, 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * r  Receives the reduced value.
 * a  Value to be reduced.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_106(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* Only the remainder is wanted, so no quotient buffer is supplied. */
    int ret = sp_3072_div_106(a, m, NULL, r);

    return ret;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is selected at compile time:
 *  - WOLFSSL_SP_SMALL defined and WOLFSSL_SP_FAST_MODEXP not defined:
 *    one exponent bit per iteration using three temporaries that are
 *    zeroed before use.
 *  - otherwise, WC_NO_CACHE_RESISTANT not defined: the same bit-by-bit
 *    loop without zeroing the temporaries.
 *  - otherwise: 4-bit fixed window over a table of 16 precomputed powers.
 *
 * r        A single precision number that is the result of the operation.
 *          Every variant copies 212 digits (106 * 2) into r.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is used.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_106(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 212];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 106 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]; each t[i] is 212 digits long. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 106 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 106U * 2U);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_106(norm, m);

        if (reduceA != 0) {
            err = sp_3072_mod_106(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 106U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m */
        sp_3072_mul_106(t[1], t[1], norm);
        err = sp_3072_mod_106(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top exponent bit; c counts the bits
         * still to be consumed from n, which is kept aligned so that the next
         * bit is always bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Current digit used up: stop after digit 0, else load the
                 * next lower digit. */
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_106(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit, via t[2]. The source
             * and destination address is formed by masking both pointers
             * rather than indexing t[] with y.
             * NOTE(review): addr_mask is defined elsewhere; presumably
             * addr_mask[0] is 0 and addr_mask[1] is all ones so the sum
             * equals t[y] - confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 106 * 2);
            sp_3072_mont_sqr_106(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 106 * 2);
        }

        /* Final Montgomery reduction of t[0], then subtract m when the
         * comparison result is not negative (t[0] >= m). */
        sp_3072_mont_reduce_106(t[0], m, mp);
        n = sp_3072_cmp_106(t[0], m);
        sp_3072_cond_sub_106(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 106 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 212];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 106 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]; each t[i] is 212 digits long. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 106 * 2);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_106(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA != 0) {
            err = sp_3072_mod_106(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_106(t[1], t[1], norm);
                err = sp_3072_mod_106(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_106(t[1], a, norm);
            err = sp_3072_mod_106(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top exponent bit; c counts the bits
         * still to be consumed from n, which is kept aligned so that the next
         * bit is always bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Current digit used up: stop after digit 0, else load the
                 * next lower digit. */
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_106(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit, via t[2]. The address
             * is formed by masking both pointers rather than indexing t[].
             * NOTE(review): addr_mask is defined elsewhere; presumably
             * addr_mask[0] is 0 and addr_mask[1] is all ones - confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 106 * 2);
            sp_3072_mont_sqr_106(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 106 * 2);
        }

        /* Final Montgomery reduction of t[0], then subtract m when the
         * comparison result is not negative (t[0] >= m). */
        sp_3072_mont_reduce_106(t[0], m, mp);
        n = sp_3072_cmp_106(t[0], m);
        sp_3072_cond_sub_106(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 106 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(16 * 212) + 212];
#endif
    sp_digit* t[16];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 212) + 212), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* 16 table entries of 212 digits each; norm shares storage with t[0]
         * and rt is the 212 digit accumulator after the table
         * (16 * 212 = 3392). */
        norm = td;
        for (i=0; i<16; i++)
            t[i] = td + i * 212;
        rt = td + 3392;

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_106(norm, m);

        /* t[1] = (a * norm) mod m, reducing a first when requested. */
        if (reduceA != 0) {
            err = sp_3072_mod_106(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_106(t[1], t[1], norm);
                err = sp_3072_mod_106(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_106(t[1], a, norm);
            err = sp_3072_mod_106(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] is the k-th power of t[1]. Even entries are
         * squares of t[k/2]; odd entries are products of two adjacent
         * lower entries. */
        sp_3072_mont_sqr_106(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_106(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_106(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_106(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_106(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_106(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_106(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_106(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_106(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_106(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_106(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_106(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_106(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_106(t[15], t[ 8], t[ 7], m, mp);

        /* Round the bit count up to a whole number of 4-bit windows, then
         * find the top digit index and how many of its bits are in use. */
        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* n holds upcoming exponent bits left-aligned in a 32-bit word.
         * Rounding up may point one digit past the 106 digit exponent, in
         * which case the top bits are taken as zero. */
        if (i < 106) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* Fewer than 4 bits available: append the next digit below them. */
        if (c < 4) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* First window selects the starting value directly from the table. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 212);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is empty: load the next digit left-aligned. */
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                /* Window straddles two digits: combine the bits left in n
                 * with the top bits of the next digit. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* Four squarings followed by one table multiply per window. */
            sp_3072_mont_sqr_106(rt, rt, m, mp);
            sp_3072_mont_sqr_106(rt, rt, m, mp);
            sp_3072_mont_sqr_106(rt, rt, m, mp);
            sp_3072_mont_sqr_106(rt, rt, m, mp);

            sp_3072_mont_mul_106(rt, rt, t[y], m, mp);
        }

        /* Final Montgomery reduction of rt, then subtract m when the
         * comparison result is not negative (rt >= m). */
        sp_3072_mont_reduce_106(rt, m, mp);
        n = sp_3072_cmp_106(rt, m);
        sp_3072_cond_sub_106(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 212);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes (in ^ em) mod mm and writes the result as a fixed-length, 384 byte
 * big-endian number.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent. Must be non-zero and no more than 64 bits long.
 * mm      Modulus. Must be odd and exactly 3072 bits long.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, the number of bytes available in out.
 *         On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits, MP_VAL when the
 * modulus is even, MP_EXPTMOD_E when the exponent is zero, MEMORY_E when
 * dynamic memory allocation fails, or the error from the modular reduction.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[106 * 5];
#endif
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_digit* norm = NULL;
    sp_uint64 e[1] = {0};
    sp_digit mp = 0;
    int i;
    int err = MP_OKAY;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }

    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: a (212 words) | r (212 words) | m (106 words).
         * norm shares r's storage as it is consumed before r is written. */
        r = a + 106 * 2;
        m = r + 106 * 2;
        norm = r;

        sp_3072_from_bin(a, 106, in, inLen);
        /* Gather the exponent (at most 64 bits, checked above) from however
         * many mp_int digits it spans. Bounding by em->used means no digit is
         * read from an empty mp_int, in which case e[0] stays zero. */
        for (i = 0; (i < (int)em->used) && ((i * (DIGIT_BIT)) < 64); i++) {
            e[0] |= ((sp_uint64)em->dp[i]) << (i * (DIGIT_BIT));
        }
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 106, mm);

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_106(norm, m);
    }
    if (err == MP_OKAY) {
        /* Move the base into Montgomery form: a = a * norm mod m. */
        sp_3072_mul_106(a, a, norm);
        err = sp_3072_mod_106(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Locate the most significant set bit of the exponent. */
        for (i=63; i>=0; i--) {
            if ((e[0] >> i) != 0) {
                break;
            }
        }

        /* Left-to-right square and multiply; the top bit is accounted for by
         * starting with r = a. */
        XMEMCPY(r, a, sizeof(sp_digit) * 106 * 2);
        for (i--; i>=0; i--) {
            sp_3072_mont_sqr_106(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1) {
                sp_3072_mont_mul_106(r, r, a, m, mp);
            }
        }
        /* Leave Montgomery form and subtract m once if r >= m. */
        sp_3072_mont_reduce_106(r, m, mp);
        mp = sp_3072_cmp_106(r, m);
        sp_3072_cond_sub_106(r, r, m, ~(mp >> 31));

        sp_3072_to_bin_106(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[106 * 5];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_uint64 e[1] = {0};
    int i;
    int err = MP_OKAY;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: a (212 words) | r (212 words) | m (106 words). */
        a = d;
        r = a + 106 * 2;
        m = r + 106 * 2;

        sp_3072_from_bin(a, 106, in, inLen);
        /* Gather the exponent (at most 64 bits, checked above) from however
         * many mp_int digits it spans. Bounding by em->used means no digit is
         * read from an empty mp_int, in which case e[0] stays zero. */
        for (i = 0; (i < (int)em->used) && ((i * (DIGIT_BIT)) < 64); i++) {
            e[0] |= ((sp_uint64)em->dp[i]) << (i * (DIGIT_BIT));
        }
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 106, mm);

        if (e[0] == 0x3) {
            /* Exponent 3 shortcut: r = a^2 mod m, then r = a * r mod m. */
            sp_3072_sqr_106(r, a);
            err = sp_3072_mod_106(r, r, m);
            if (err == MP_OKAY) {
                sp_3072_mul_106(r, a, r);
                err = sp_3072_mod_106(r, r, m);
            }
        }
        else {
            /* norm shares r's storage as it is consumed before r is written. */
            sp_digit* norm = r;
            sp_digit mp;

            sp_3072_mont_setup(m, &mp);
            sp_3072_mont_norm_106(norm, m);

            /* Move the base into Montgomery form: a = a * norm mod m. */
            sp_3072_mul_106(a, a, norm);
            err = sp_3072_mod_106(a, a, m);

            if (err == MP_OKAY) {
                /* Locate the most significant set bit of the exponent. */
                for (i=63; i>=0; i--) {
                    if ((e[0] >> i) != 0) {
                        break;
                    }
                }

                /* Left-to-right square and multiply; the top bit is accounted
                 * for by starting with r = a. */
                XMEMCPY(r, a, sizeof(sp_digit) * 212U);
                for (i--; i>=0; i--) {
                    sp_3072_mont_sqr_106(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1) {
                        sp_3072_mont_mul_106(r, r, a, m, mp);
                    }
                }
                /* Leave Montgomery form and subtract m once if r >= m. */
                sp_3072_mont_reduce_106(r, m, mp);
                mp = sp_3072_cmp_106(r, m);
                sp_3072_cond_sub_106(r, r, m, ~(mp >> 31));
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_106(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM)
#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */
/* RSA private key operation.
 *
 * When SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM is defined the result is computed
 * directly as (in ^ dm) mod mm. Otherwise the Chinese Remainder Theorem form
 * is used with pm, qm, dpm, dqm and qim.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry, the number of bytes available in out.
 *         On success, set to 384.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits, MP_VAL when the
 * modulus (or, in CRT form, a prime) is even and MEMORY_E when dynamic memory
 * allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
    /* The private exponent form is the same whether or not WOLFSSL_SP_SMALL
     * is defined. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[106 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 3072) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: d (106 words) | a (212 words) | m (106 words).
         * The result r is written over a. */
        a = d + 106;
        m = a + 212;
        r = a;

        sp_3072_from_bin(a, 106, in, inLen);
        sp_3072_from_mp(d, 106, dm);
        sp_3072_from_mp(m, 106, mm);
        err = sp_3072_mod_exp_106(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_106(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* "d" holds a copy of the private exponent and "a"/"r" the private
         * key result, so wipe the whole scratch buffer as the CRT forms do. */
        ForceZero(d, sizeof(sp_digit) * 106 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#if defined(WOLFSSL_SP_SMALL)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[53 * 8];
#endif
    sp_digit* p = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 8, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif
    if (err == MP_OKAY) {
        /* Scratch layout: a (106 words) | p (53) | dp/dq/qi (53, shared) |
         * tmpa (106) | tmpb (106). The slot "p" is reloaded with either
         * prime as needed and dp, dq and qi take turns in one slot. */
        p = a + 106;
        qi = dq = dp = p + 53;
        tmpa = qi + 53;
        tmpb = tmpa + 106;
        r = a;

        /* tmpa = in ^ dp mod p */
        sp_3072_from_bin(a, 106, in, inLen);
        sp_3072_from_mp(p, 53, pm);
        sp_3072_from_mp(dp, 53, dpm);
        err = sp_3072_mod_exp_53(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        sp_3072_from_mp(p, 53, qm);
        sp_3072_from_mp(dq, 53, dqm);
        err = sp_3072_mod_exp_53(tmpb, a, dq, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) mod p: add p back, up to twice, while the
         * difference is negative. */
        sp_3072_from_mp(p, 53, pm);
        (void)sp_3072_sub_53(tmpa, tmpa, tmpb);
        sp_3072_norm_53(tmpa);
        sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31));
        sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31));
        sp_3072_norm_53(tmpa);

        /* tmpa = tmpa * qi mod p */
        sp_3072_from_mp(qi, 53, qim);
        sp_3072_mul_53(tmpa, tmpa, qi);
        err = sp_3072_mod_53(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_3072_from_mp(p, 53, qm);
        sp_3072_mul_53(tmpa, p, tmpa);
        (void)sp_3072_add_106(r, tmpb, tmpa);
        sp_3072_norm_106(r);

        sp_3072_to_bin_106(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Every slot has held private key material - wipe all of it. */
        ForceZero(a, sizeof(sp_digit) * 53 * 8);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[53 * 13];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 13, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: a (212 words) | p | q | dp | dq | qi (53 each) |
         * tmpa (106) | tmpb (106). */
        p = a + 106 * 2;
        q = p + 53;
        dp = q + 53;
        dq = dp + 53;
        qi = dq + 53;
        tmpa = qi + 53;
        tmpb = tmpa + 106;
        r = a;

        sp_3072_from_bin(a, 106, in, inLen);
        sp_3072_from_mp(p, 53, pm);
        sp_3072_from_mp(q, 53, qm);
        sp_3072_from_mp(dp, 53, dpm);
        sp_3072_from_mp(dq, 53, dqm);
        sp_3072_from_mp(qi, 53, qim);

        /* tmpa = in ^ dp mod p */
        err = sp_3072_mod_exp_53(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        err = sp_3072_mod_exp_53(tmpb, a, dq, 1536, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = (tmpa - tmpb) mod p: add p back, up to twice, while the
         * difference is negative. */
        (void)sp_3072_sub_53(tmpa, tmpa, tmpb);
        sp_3072_norm_53(tmpa);
        sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31));
        sp_3072_cond_add_53(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[52] >> 31));
        sp_3072_norm_53(tmpa);
        /* tmpa = tmpa * qi mod p */
        sp_3072_mul_53(tmpa, tmpa, qi);
        err = sp_3072_mod_53(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_3072_mul_53(tmpa, tmpa, q);
        (void)sp_3072_add_106(r, tmpb, tmpa);
        sp_3072_norm_106(r);

        sp_3072_to_bin_106(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Every slot has held private key material - wipe all of it. */
        ForceZero(a, sizeof(sp_digit) * 53 * 13);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
}

#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * Reads 106 words of 29 bits each from a and repacks them into r's digits of
 * DIGIT_BIT bits each. One of three variants is compiled depending on how
 * DIGIT_BIT compares to 29.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow.
 */
static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 29
        /* Same number of bits per digit: copy the words across unchanged. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 106);
        r->used = 106;
        mp_clamp(r);
#elif DIGIT_BIT < 29
        /* mp_int digits are narrower: each 29-bit word spans several digits.
         * i indexes words of a, j indexes digits of r and s tracks the bit
         * offset at which the current word is being split. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 106; i++) {
            /* Top up the partially filled digit with this word's low bits. */
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            /* Emit further whole digits while the word still has enough
             * bits left to fill one. */
            while (s + DIGIT_BIT <= 29) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 29 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp_int digits are wider: several 29-bit words go into one digit.
         * i indexes words of a, j indexes digits of r and s is the bit
         * offset within the current digit. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 106; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 29 >= DIGIT_BIT) {
                /* This word completes the digit; the bits that did not fit
                 * start the next one. */
                /* NOTE(review): for a DIGIT_BIT of 32 or 64 the last such
                 * store appears to target index (3072 / DIGIT_BIT), one past
                 * the digit count requested from mp_grow above - confirm the
                 * mp_int always has that spare digit. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 29 - s;
            }
            else {
                s += 29;
            }
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = (base ^ exp) mod mod for a 3072-bit modulus.
 *
 * base  Base. MP integer. No more than 3072 bits.
 * exp   Exponent. MP integer. No more than 3072 bits.
 * mod   Modulus. MP integer. Must be odd and exactly 3072 bits long.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array,
 * MP_VAL if the modulus is even and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    /* The implementation is the same whether or not WOLFSSL_SP_SMALL is
     * defined, so there is a single body. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[106 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072) {
        err = MP_READ_E;
    }
    else if (expBits > 3072) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: b (212 words) | e (106 words) | m (106 words).
         * The result r is written over b. */
        e = b + 106 * 2;
        m = e + 106;
        r = b;

        sp_3072_from_mp(b, 106, base);
        sp_3072_from_mp(e, 106, exp);
        sp_3072_from_mp(m, 106, mod);

        /* Reuse the bit count taken during validation. */
        err = sp_3072_mod_exp_106(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 106U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_3072
/* Shift a 106 word number left by n bits, producing 107 words. (r = a << n)
 *
 * Words hold 29 bits each. Words are written from the most significant end
 * downwards, so every source word is read before it can be overwritten.
 *
 * r  Result of the shift. Must have room for 107 words.
 * a  Number to shift. 106 words.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_3072_lshift_106(sp_digit* r, const sp_digit* a,
        byte n)
{
    int idx = 105;

    /* Bits pushed out of the top word become the extra word. */
    r[106] = a[105] >> (29 - n);
    while (idx > 0) {
        /* Shifted word combined with the bits carried up from below. */
        r[idx] = ((a[idx] << n) | (a[idx - 1] >> (29 - n))) & 0x1fffffff;
        idx--;
    }
    /* Nothing is carried into the lowest word. */
    r[0] = (a[0] << n) & 0x1fffffff;
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed four bits at a time from the most significant
 * end. As the base is 2, multiplying by 2^y for a window value y is done with
 * a left shift by y bits rather than a table lookup and multiplication.
 *
 * r     A single precision number that is the result of the operation.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_2_106(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[319];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 319, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: norm (212 words) | tmp (107 words). */
        norm = td;
        tmp  = td + 212;
        XMEMSET(td, 0, sizeof(sp_digit) * 319);

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_106(norm, m);

        /* Round the bit count up to a whole number of 4-bit windows, then
         * find the top exponent word (i) and how many bits of it are in
         * use (c). */
        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* n holds the exponent bits still to be used, left aligned so that
         * the next window is always its top four bits. */
        if (i < 106) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 4) {
            /* Fewer than four bits in the top word: pull in the next one. */
            n |= e[i--] << (3 - c);
            c += 29;
        }
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        /* First window: r = norm * 2^y. */
        sp_3072_lshift_106(r, norm, (byte)y);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is still available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is used up: load the next 29-bit word left aligned. */
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                /* The window straddles two words: take the c bits left in n
                 * and complete it from the top of the next word. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* Four squarings raise r to the 16th power for the new window. */
            sp_3072_mont_sqr_106(r, r, m, mp);
            sp_3072_mont_sqr_106(r, r, m, mp);
            sp_3072_mont_sqr_106(r, r, m, mp);
            sp_3072_mont_sqr_106(r, r, m, mp);

            /* Multiply by 2^y with a shift. The bits now at position 3072
             * and above (105 * 29 + 27 = 3072) are removed from r and added
             * back in as a multiple of norm - presumably 2^3072 mod m, as set
             * by sp_3072_mont_norm_106; confirm against that helper. */
            sp_3072_lshift_106(r, r, (byte)y);
            sp_3072_mul_d_106(tmp, norm, (r[106] << 2) + (r[105] >> 27));
            r[106] = 0;
            r[105] &= 0x7ffffffL;
            (void)sp_3072_add_106(r, r, tmp);
            sp_3072_norm_106(r);
            /* Subtract m once when r is not less than m. */
            o = sp_3072_cmp_106(r, m);
            sp_3072_cond_sub_106(r, r, m, ~(o >> 31));
        }

        /* Leave Montgomery form and subtract m once if r >= m. */
        sp_3072_mont_reduce_106(r, m, mp);
        n = sp_3072_cmp_106(r, m);
        sp_3072_cond_sub_106(r, r, m, ~(n >> 31));
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* HAVE_FFDHE_3072 */

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes (base ^ exp) mod mod and writes the result to out as big-endian
 * bytes with any leading zero bytes removed.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array,
 * MP_VAL if the modulus is even and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[106 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    word32 lead;
    int err = MP_OKAY;

    /* Size checks first, in order, then the parity of the modulus. */
    if ((mp_count_bits(base) > 3072) || (expLen > 384U) ||
            (mp_count_bits(mod) != 3072)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 106 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: b (212 words) | e (106 words) | m (106 words).
         * The result is written over the base. */
        r = b;
        e = b + 106 * 2;
        m = e + 106;

        sp_3072_from_mp(b, 106, base);
        sp_3072_from_bin(e, 106, exp, expLen);
        sp_3072_from_mp(m, 106, mod);

    #ifdef HAVE_FFDHE_3072
        /* A base of 2 with the top 16 bits of the modulus all set can use
         * the shift-based exponentiation. */
        if ((base->used == 1) && (base->dp[0] == 2U) &&
                ((m[105] >> 11) == 0xffffL)) {
            err = sp_3072_mod_exp_2_106(r, e, expLen * 8U, m);
        }
        else
    #endif
        {
            err = sp_3072_mod_exp_106(r, b, e, expLen * 8U, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_106(r, out);

        /* Count the leading zero bytes and move the rest to the front. */
        lead = 0;
        while ((lead < 384U) && (out[lead] == 0U)) {
            lead++;
        }
        *outLen = 384;
        *outLen -= lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* only "e" is sensitive and needs zeroized; it is only non-NULL once the
     * scratch buffer is available */
    if (e != NULL) {
        ForceZero(e, sizeof(sp_digit) * 106U);
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL) {
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    }
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = (base ^ exp) mod mod for a 1536-bit modulus using the
 * 53 word operations of the 3072-bit implementation.
 *
 * base  Base. MP integer. No more than 1536 bits.
 * exp   Exponent. MP integer. No more than 1536 bits.
 * mod   Modulus. MP integer. Must be odd and exactly 1536 bits long.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array,
 * MP_VAL if the modulus is even and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    /* The implementation is the same whether or not WOLFSSL_SP_SMALL is
     * defined, so there is a single body. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[53 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 1536) {
        err = MP_READ_E;
    }
    else if (expBits > 1536) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 1536) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 53 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: b (106 words) | e (53 words) | m (53 words).
         * The result r is written over b. */
        e = b + 53 * 2;
        m = e + 53;
        r = b;

        sp_3072_from_mp(b, 53, base);
        sp_3072_from_mp(e, 53, exp);
        sp_3072_from_mp(m, 53, mod);

        /* Reuse the bit count taken during validation. */
        err = sp_3072_mod_exp_53(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        /* sp_3072_to_mp reads 106 words, so clear the upper 53 first. */
        XMEMSET(r + 53, 0, sizeof(*r) * 53U);
        err = sp_3072_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized; 106 words from e also
         * covers m, which ends the buffer */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 106U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#else
/* Read big endian unsigned byte array into r.
 *
 * Bytes are taken from the end of the array (least significant first) and
 * packed into words of 28 bits. Words of r that receive no data are zeroed.
 *
 * r  A single precision integer.
 * size  Maximum number of words of r to fill.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int src;
    int dst = 0;
    word32 shift = 0;

    r[0] = 0;
    for (src = n - 1; src >= 0; src--) {
        r[dst] |= (((sp_digit)a[src]) << shift);
        if (shift < 20U) {
            /* The whole byte fitted and the word still has room. */
            shift += 8U;
            continue;
        }

        /* The byte reached or crossed the top of the 28-bit word. */
        r[dst] &= 0xfffffff;
        shift = 28U - shift;
        if (dst + 1 >= size) {
            break;
        }
        /* The part of the byte that did not fit starts the next word. */
        dst++;
        r[dst] = (sp_digit)a[src] >> shift;
        shift = 8U - shift;
    }

    /* Zero the words above the last one written. */
    dst++;
    while (dst < size) {
        r[dst] = 0;
        dst++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the digits of a (DIGIT_BIT bits each) into words of 28 bits. One of
 * three variants is compiled depending on how DIGIT_BIT compares to 28.
 * Words of r that receive no data are zeroed.
 *
 * r  A single precision integer.
 * size  Maximum number of words of r to fill.
 * a  A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 28
    /* Same number of bits per digit: copy without branching on a->used.
     * j counts up from -used, so its sign bit is set exactly while there are
     * digits left to copy. */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* Take the sign bit through an unsigned shift so that it is exactly
         * 0 or 1 whatever the signedness of sp_digit, then widen it to an
         * all-zeros or all-ones mask. */
        word32 more = (word32)j >> 31;
        sp_digit mask = (sp_digit)0 - (sp_digit)more;

        r[i] = a->dp[o] & mask;
        j++;
        /* Step to the next digit only while another one remains, so that o
         * never moves past the last digit in use. */
        o += (int)((word32)j >> 31);
    }
#elif DIGIT_BIT > 28
    /* mp_int digits are wider: each digit spans several 28-bit words.
     * i indexes digits of a, j indexes words of r and s tracks the bit offset
     * at which the current digit is being split. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Top up the partially filled word with this digit's low bits. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0xfffffff;
        s = 28U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further whole words while the digit still has enough bits
         * left to fill one. */
        while ((s + 28U) <= (word32)DIGIT_BIT) {
            s += 28U;
            r[j] &= 0xfffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp_int digits are narrower: several digits go into one 28-bit word.
     * i indexes digits of a, j indexes words of r and s is the bit offset
     * within the current word. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 28) {
            /* This digit completes the word; any bits that did not fit start
             * the next one. */
            r[j] &= 0xfffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 28 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the words above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 384
 *
 * The words of r are first normalized in place (carries propagated, each of
 * the low 111 words masked to 28 bits), so r is modified by this call.
 *
 * r  A single precision integer (112 words, updated in place).
 * a  Byte array (384 bytes; a[383] receives the least significant byte).
 */
static void sp_3072_to_bin_112(sp_digit* r, byte* a)
{
    int i;
    int j;
    /* Left shift applied when merging the next word into the byte a[j]. */
    int s = 0;
    /* Bit position in r[i] of the next bits to be written out. */
    int b;

    /* Propagate carries so every word below the top holds 28 bits. */
    for (i=0; i<111; i++) {
        r[i+1] += r[i] >> 28;
        r[i] &= 0xfffffff;
    }
    /* Start at the last byte of the output: index 383. */
    j = 3079 / 8 - 1;
    a[j] = 0;
    /* 110 words are consumed; stop early once the output is full. */
    for (i=0; i<110 && j>=0; i++) {
        b = 0;
        /* Merge the low bits of this word into the current byte. */
        /* lint allow cast of mismatch sp_digit and int */
        a[j--] |= (byte)(r[i] << s); /*lint !e9033*/
        b += 8 - s;
        if (j < 0) {
            break;
        }
        /* Emit the rest of the word a byte at a time. */
        while (b < 28) {
            a[j--] = (byte)(r[i] >> b);
            b += 8;
            if (j < 0) {
                break;
            }
        }
        /* b - 28 is how far the last byte written ran past the end of the
         * word; the next word is merged in at bit 8 - (b - 28). */
        s = 8 - (b - 28);
        if (j >= 0) {
            a[j] = 0;
        }
        /* Step back to the last byte written so the next word is OR-ed
         * into it. */
        if (s != 0) {
            j++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Propagate carries through a 56 word number so that words 0..54 each hold
 * 28 bits. Word 55 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_56(sp_digit* a)
{
    int k;

    for (k = 1; k < 56; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
/* Propagate carries through a 55 word number so that words 0..53 each hold
 * 28 bits. Word 54 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_55(sp_digit* a)
{
    int k;

    for (k = 1; k < 55; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

/* Propagate carries through a 112 word number so that words 0..110 each
 * hold 28 bits. Word 111 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_112(sp_digit* a)
{
    int k;

    for (k = 1; k < 112; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

/* Propagate carries through a 110 word number so that words 0..108 each
 * hold 28 bits. Word 109 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_110(sp_digit* a)
{
    int k;

    for (k = 1; k < 110; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled column-wise multiplication of two 14 word numbers. For
 * each result word k the products a[i] * b[k - i] are summed in a 64-bit
 * accumulator, the low 28 bits are stored and the rest is carried into the
 * next column. t0 and t1 alternate as the accumulator of the current and
 * the next column.
 *
 * The low 14 result words are collected in a local buffer and copied to r
 * only at the end; the high 14 words are written to r directly.
 * NOTE(review): presumably this lets r share storage with a or b - confirm
 * against callers.
 *
 * r  A single precision integer (28 words written).
 * a  A single precision integer (14 words read).
 * b  A single precision integer (14 words read).
 */
SP_NOINLINE static void sp_3072_mul_14(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_uint64 t0;
    sp_uint64 t1;
    /* Holds result words 0..13 until all input words have been read. */
    sp_digit t[14];

    /* Columns 0..13: the number of products grows by one per column. */
    t0 = ((sp_uint64)a[ 0]) * b[ 0];
    t1 = ((sp_uint64)a[ 0]) * b[ 1]
       + ((sp_uint64)a[ 1]) * b[ 0];
    t[ 0] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[ 2]
       + ((sp_uint64)a[ 1]) * b[ 1]
       + ((sp_uint64)a[ 2]) * b[ 0];
    t[ 1] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[ 3]
       + ((sp_uint64)a[ 1]) * b[ 2]
       + ((sp_uint64)a[ 2]) * b[ 1]
       + ((sp_uint64)a[ 3]) * b[ 0];
    t[ 2] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[ 4]
       + ((sp_uint64)a[ 1]) * b[ 3]
       + ((sp_uint64)a[ 2]) * b[ 2]
       + ((sp_uint64)a[ 3]) * b[ 1]
       + ((sp_uint64)a[ 4]) * b[ 0];
    t[ 3] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[ 5]
       + ((sp_uint64)a[ 1]) * b[ 4]
       + ((sp_uint64)a[ 2]) * b[ 3]
       + ((sp_uint64)a[ 3]) * b[ 2]
       + ((sp_uint64)a[ 4]) * b[ 1]
       + ((sp_uint64)a[ 5]) * b[ 0];
    t[ 4] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[ 6]
       + ((sp_uint64)a[ 1]) * b[ 5]
       + ((sp_uint64)a[ 2]) * b[ 4]
       + ((sp_uint64)a[ 3]) * b[ 3]
       + ((sp_uint64)a[ 4]) * b[ 2]
       + ((sp_uint64)a[ 5]) * b[ 1]
       + ((sp_uint64)a[ 6]) * b[ 0];
    t[ 5] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[ 7]
       + ((sp_uint64)a[ 1]) * b[ 6]
       + ((sp_uint64)a[ 2]) * b[ 5]
       + ((sp_uint64)a[ 3]) * b[ 4]
       + ((sp_uint64)a[ 4]) * b[ 3]
       + ((sp_uint64)a[ 5]) * b[ 2]
       + ((sp_uint64)a[ 6]) * b[ 1]
       + ((sp_uint64)a[ 7]) * b[ 0];
    t[ 6] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[ 8]
       + ((sp_uint64)a[ 1]) * b[ 7]
       + ((sp_uint64)a[ 2]) * b[ 6]
       + ((sp_uint64)a[ 3]) * b[ 5]
       + ((sp_uint64)a[ 4]) * b[ 4]
       + ((sp_uint64)a[ 5]) * b[ 3]
       + ((sp_uint64)a[ 6]) * b[ 2]
       + ((sp_uint64)a[ 7]) * b[ 1]
       + ((sp_uint64)a[ 8]) * b[ 0];
    t[ 7] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[ 9]
       + ((sp_uint64)a[ 1]) * b[ 8]
       + ((sp_uint64)a[ 2]) * b[ 7]
       + ((sp_uint64)a[ 3]) * b[ 6]
       + ((sp_uint64)a[ 4]) * b[ 5]
       + ((sp_uint64)a[ 5]) * b[ 4]
       + ((sp_uint64)a[ 6]) * b[ 3]
       + ((sp_uint64)a[ 7]) * b[ 2]
       + ((sp_uint64)a[ 8]) * b[ 1]
       + ((sp_uint64)a[ 9]) * b[ 0];
    t[ 8] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[10]
       + ((sp_uint64)a[ 1]) * b[ 9]
       + ((sp_uint64)a[ 2]) * b[ 8]
       + ((sp_uint64)a[ 3]) * b[ 7]
       + ((sp_uint64)a[ 4]) * b[ 6]
       + ((sp_uint64)a[ 5]) * b[ 5]
       + ((sp_uint64)a[ 6]) * b[ 4]
       + ((sp_uint64)a[ 7]) * b[ 3]
       + ((sp_uint64)a[ 8]) * b[ 2]
       + ((sp_uint64)a[ 9]) * b[ 1]
       + ((sp_uint64)a[10]) * b[ 0];
    t[ 9] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[11]
       + ((sp_uint64)a[ 1]) * b[10]
       + ((sp_uint64)a[ 2]) * b[ 9]
       + ((sp_uint64)a[ 3]) * b[ 8]
       + ((sp_uint64)a[ 4]) * b[ 7]
       + ((sp_uint64)a[ 5]) * b[ 6]
       + ((sp_uint64)a[ 6]) * b[ 5]
       + ((sp_uint64)a[ 7]) * b[ 4]
       + ((sp_uint64)a[ 8]) * b[ 3]
       + ((sp_uint64)a[ 9]) * b[ 2]
       + ((sp_uint64)a[10]) * b[ 1]
       + ((sp_uint64)a[11]) * b[ 0];
    t[10] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 0]) * b[12]
       + ((sp_uint64)a[ 1]) * b[11]
       + ((sp_uint64)a[ 2]) * b[10]
       + ((sp_uint64)a[ 3]) * b[ 9]
       + ((sp_uint64)a[ 4]) * b[ 8]
       + ((sp_uint64)a[ 5]) * b[ 7]
       + ((sp_uint64)a[ 6]) * b[ 6]
       + ((sp_uint64)a[ 7]) * b[ 5]
       + ((sp_uint64)a[ 8]) * b[ 4]
       + ((sp_uint64)a[ 9]) * b[ 3]
       + ((sp_uint64)a[10]) * b[ 2]
       + ((sp_uint64)a[11]) * b[ 1]
       + ((sp_uint64)a[12]) * b[ 0];
    t[11] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 0]) * b[13]
       + ((sp_uint64)a[ 1]) * b[12]
       + ((sp_uint64)a[ 2]) * b[11]
       + ((sp_uint64)a[ 3]) * b[10]
       + ((sp_uint64)a[ 4]) * b[ 9]
       + ((sp_uint64)a[ 5]) * b[ 8]
       + ((sp_uint64)a[ 6]) * b[ 7]
       + ((sp_uint64)a[ 7]) * b[ 6]
       + ((sp_uint64)a[ 8]) * b[ 5]
       + ((sp_uint64)a[ 9]) * b[ 4]
       + ((sp_uint64)a[10]) * b[ 3]
       + ((sp_uint64)a[11]) * b[ 2]
       + ((sp_uint64)a[12]) * b[ 1]
       + ((sp_uint64)a[13]) * b[ 0];
    t[12] = t0 & 0xfffffff; t1 += t0 >> 28;
    /* Columns 14..26: the number of products shrinks by one per column. */
    t0 = ((sp_uint64)a[ 1]) * b[13]
       + ((sp_uint64)a[ 2]) * b[12]
       + ((sp_uint64)a[ 3]) * b[11]
       + ((sp_uint64)a[ 4]) * b[10]
       + ((sp_uint64)a[ 5]) * b[ 9]
       + ((sp_uint64)a[ 6]) * b[ 8]
       + ((sp_uint64)a[ 7]) * b[ 7]
       + ((sp_uint64)a[ 8]) * b[ 6]
       + ((sp_uint64)a[ 9]) * b[ 5]
       + ((sp_uint64)a[10]) * b[ 4]
       + ((sp_uint64)a[11]) * b[ 3]
       + ((sp_uint64)a[12]) * b[ 2]
       + ((sp_uint64)a[13]) * b[ 1];
    t[13] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 2]) * b[13]
       + ((sp_uint64)a[ 3]) * b[12]
       + ((sp_uint64)a[ 4]) * b[11]
       + ((sp_uint64)a[ 5]) * b[10]
       + ((sp_uint64)a[ 6]) * b[ 9]
       + ((sp_uint64)a[ 7]) * b[ 8]
       + ((sp_uint64)a[ 8]) * b[ 7]
       + ((sp_uint64)a[ 9]) * b[ 6]
       + ((sp_uint64)a[10]) * b[ 5]
       + ((sp_uint64)a[11]) * b[ 4]
       + ((sp_uint64)a[12]) * b[ 3]
       + ((sp_uint64)a[13]) * b[ 2];
    r[14] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 3]) * b[13]
       + ((sp_uint64)a[ 4]) * b[12]
       + ((sp_uint64)a[ 5]) * b[11]
       + ((sp_uint64)a[ 6]) * b[10]
       + ((sp_uint64)a[ 7]) * b[ 9]
       + ((sp_uint64)a[ 8]) * b[ 8]
       + ((sp_uint64)a[ 9]) * b[ 7]
       + ((sp_uint64)a[10]) * b[ 6]
       + ((sp_uint64)a[11]) * b[ 5]
       + ((sp_uint64)a[12]) * b[ 4]
       + ((sp_uint64)a[13]) * b[ 3];
    r[15] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 4]) * b[13]
       + ((sp_uint64)a[ 5]) * b[12]
       + ((sp_uint64)a[ 6]) * b[11]
       + ((sp_uint64)a[ 7]) * b[10]
       + ((sp_uint64)a[ 8]) * b[ 9]
       + ((sp_uint64)a[ 9]) * b[ 8]
       + ((sp_uint64)a[10]) * b[ 7]
       + ((sp_uint64)a[11]) * b[ 6]
       + ((sp_uint64)a[12]) * b[ 5]
       + ((sp_uint64)a[13]) * b[ 4];
    r[16] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 5]) * b[13]
       + ((sp_uint64)a[ 6]) * b[12]
       + ((sp_uint64)a[ 7]) * b[11]
       + ((sp_uint64)a[ 8]) * b[10]
       + ((sp_uint64)a[ 9]) * b[ 9]
       + ((sp_uint64)a[10]) * b[ 8]
       + ((sp_uint64)a[11]) * b[ 7]
       + ((sp_uint64)a[12]) * b[ 6]
       + ((sp_uint64)a[13]) * b[ 5];
    r[17] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 6]) * b[13]
       + ((sp_uint64)a[ 7]) * b[12]
       + ((sp_uint64)a[ 8]) * b[11]
       + ((sp_uint64)a[ 9]) * b[10]
       + ((sp_uint64)a[10]) * b[ 9]
       + ((sp_uint64)a[11]) * b[ 8]
       + ((sp_uint64)a[12]) * b[ 7]
       + ((sp_uint64)a[13]) * b[ 6];
    r[18] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 7]) * b[13]
       + ((sp_uint64)a[ 8]) * b[12]
       + ((sp_uint64)a[ 9]) * b[11]
       + ((sp_uint64)a[10]) * b[10]
       + ((sp_uint64)a[11]) * b[ 9]
       + ((sp_uint64)a[12]) * b[ 8]
       + ((sp_uint64)a[13]) * b[ 7];
    r[19] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[ 8]) * b[13]
       + ((sp_uint64)a[ 9]) * b[12]
       + ((sp_uint64)a[10]) * b[11]
       + ((sp_uint64)a[11]) * b[10]
       + ((sp_uint64)a[12]) * b[ 9]
       + ((sp_uint64)a[13]) * b[ 8];
    r[20] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[ 9]) * b[13]
       + ((sp_uint64)a[10]) * b[12]
       + ((sp_uint64)a[11]) * b[11]
       + ((sp_uint64)a[12]) * b[10]
       + ((sp_uint64)a[13]) * b[ 9];
    r[21] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[10]) * b[13]
       + ((sp_uint64)a[11]) * b[12]
       + ((sp_uint64)a[12]) * b[11]
       + ((sp_uint64)a[13]) * b[10];
    r[22] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[11]) * b[13]
       + ((sp_uint64)a[12]) * b[12]
       + ((sp_uint64)a[13]) * b[11];
    r[23] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = ((sp_uint64)a[12]) * b[13]
       + ((sp_uint64)a[13]) * b[12];
    r[24] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = ((sp_uint64)a[13]) * b[13];
    r[25] = t1 & 0xfffffff; t0 += t1 >> 28;
    r[26] = t0 & 0xfffffff;
    /* Whatever is left above 28 bits becomes the top word, unmasked. */
    r[27] = (sp_digit)(t0 >> 28);
    /* Now that a and b are no longer needed, store the low half. */
    XMEMCPY(r, t, sizeof(t));
}

/* Word-wise addition of two 14 word numbers. (r = a + b)
 *
 * No carry is propagated between words; each result word is simply the sum
 * of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_add_14(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 14; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Word-wise addition of two 28 word numbers. (r = a + b)
 *
 * No carry is propagated between words; each result word is simply the sum
 * of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_add_28(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 28; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Word-wise subtraction of two 28 word numbers. (r = a - b)
 *
 * No borrow is propagated between words; each result word is simply the
 * difference of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_sub_28(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 28; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Propagate carries through a 14 word number so that words 0..12 each hold
 * 28 bits. Word 13 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_14(sp_digit* a)
{
    int k;

    for (k = 1; k < 14; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

/* Multiply two 28 word numbers using three 14 word multiplications.
 * (r = a * b)
 *
 * With a = aH:aL and b = bH:bL split into 14 word halves, the result is
 * built from aL*bL, aH*bH and (aL+aH)*(bL+bH) - aL*bL - aH*bH.
 *
 * r  A single precision integer (56 words written).
 * a  A single precision integer (28 words read).
 * b  A single precision integer (28 words read).
 */
SP_NOINLINE static void sp_3072_mul_28(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    /* First holds aL + aH, then the middle product computed from it. */
    sp_digit mid[28];
    /* bL + bH */
    sp_digit bsum[14];

    (void)sp_3072_add_14(mid, a, a + 14);
    sp_3072_norm_14(mid);
    (void)sp_3072_add_14(bsum, b, b + 14);
    sp_3072_norm_14(bsum);

    /* aH * bH goes to the upper 28 words of r, aL * bL to the lower 28. */
    sp_3072_mul_14(r + 28, a + 14, b + 14);
    sp_3072_mul_14(r, a, b);
    sp_3072_mul_14(mid, mid, bsum);

    /* mid = (aL + aH) * (bL + bH) - aH * bH - aL * bL */
    (void)sp_3072_sub_28(mid, mid, r + 28);
    (void)sp_3072_sub_28(mid, mid, r);

    /* Add the middle term in at word 14 and tidy up the carries. */
    (void)sp_3072_add_28(r + 14, r + 14, mid);
    sp_3072_norm_56(r);
}

/* Word-wise addition of two 56 word numbers. (r = a + b)
 *
 * No carry is propagated between words; each result word is simply the sum
 * of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_add_56(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 56; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Word-wise subtraction of two 56 word numbers. (r = a - b)
 *
 * No borrow is propagated between words; each result word is simply the
 * difference of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_sub_56(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 56; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Propagate carries through a 28 word number so that words 0..26 each hold
 * 28 bits. Word 27 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_28(sp_digit* a)
{
    int k;

    for (k = 1; k < 28; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

/* Multiply two 56 word numbers using three 28 word multiplications.
 * (r = a * b)
 *
 * With a = aH:aL and b = bH:bL split into 28 word halves, the result is
 * built from aL*bL, aH*bH and (aL+aH)*(bL+bH) - aL*bL - aH*bH.
 *
 * r  A single precision integer (112 words written).
 * a  A single precision integer (56 words read).
 * b  A single precision integer (56 words read).
 */
SP_NOINLINE static void sp_3072_mul_56(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    /* First holds aL + aH, then the middle product computed from it. */
    sp_digit mid[56];
    /* bL + bH */
    sp_digit bsum[28];

    (void)sp_3072_add_28(mid, a, a + 28);
    sp_3072_norm_28(mid);
    (void)sp_3072_add_28(bsum, b, b + 28);
    sp_3072_norm_28(bsum);

    /* aH * bH goes to the upper 56 words of r, aL * bL to the lower 56. */
    sp_3072_mul_28(r + 56, a + 28, b + 28);
    sp_3072_mul_28(r, a, b);
    sp_3072_mul_28(mid, mid, bsum);

    /* mid = (aL + aH) * (bL + bH) - aH * bH - aL * bL */
    (void)sp_3072_sub_56(mid, mid, r + 56);
    (void)sp_3072_sub_56(mid, mid, r);

    /* Add the middle term in at word 28 and tidy up the carries. */
    (void)sp_3072_add_56(r + 28, r + 28, mid);
    sp_3072_norm_112(r);
}

/* Word-wise addition of two 112 word numbers. (r = a + b)
 *
 * No carry is propagated between words; each result word is simply the sum
 * of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_add_112(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 112; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Word-wise subtraction of two 112 word numbers. (r = a - b)
 *
 * No borrow is propagated between words; each result word is simply the
 * difference of the matching input words.
 *
 * r  A single precision integer (result).
 * a  A single precision integer.
 * b  A single precision integer.
 * return  Always 0.
 */
SP_NOINLINE static int sp_3072_sub_112(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 112; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Propagate carries through a 224 word number so that words 0..222 each
 * hold 28 bits. Word 223 absorbs the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize in place.
 */
static void sp_3072_norm_224(sp_digit* a)
{
    int k;

    for (k = 1; k < 224; k++) {
        /* Move everything above bit 27 of the lower word into this one. */
        a[k] += a[k - 1] >> 28;
        a[k - 1] &= 0xfffffff;
    }
}

/* Multiply two 112 word numbers using three 56 word multiplications.
 * (r = a * b)
 *
 * With a = aH:aL and b = bH:bL split into 56 word halves, the result is
 * built from aL*bL, aH*bH and (aL+aH)*(bL+bH) - aL*bL - aH*bH.
 *
 * r  A single precision integer (224 words written).
 * a  A single precision integer (112 words read).
 * b  A single precision integer (112 words read).
 */
SP_NOINLINE static void sp_3072_mul_112(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    /* First holds aL + aH, then the middle product computed from it. */
    sp_digit mid[112];
    /* bL + bH */
    sp_digit bsum[56];

    (void)sp_3072_add_56(mid, a, a + 56);
    sp_3072_norm_56(mid);
    (void)sp_3072_add_56(bsum, b, b + 56);
    sp_3072_norm_56(bsum);

    /* aH * bH goes to the upper 112 words of r, aL * bL to the lower 112. */
    sp_3072_mul_56(r + 112, a + 56, b + 56);
    sp_3072_mul_56(r, a, b);
    sp_3072_mul_56(mid, mid, bsum);

    /* mid = (aL + aH) * (bL + bH) - aH * bH - aL * bL */
    (void)sp_3072_sub_112(mid, mid, r + 112);
    (void)sp_3072_sub_112(mid, mid, r);

    /* Add the middle term in at word 56 and tidy up the carries. */
    (void)sp_3072_add_112(r + 56, r + 56, mid);
    sp_3072_norm_224(r);
}

/* Square a and put result in r. (r = a * a)
 *
 * Fully unrolled column-wise squaring of a 14 word number. For each result
 * word k the cross products a[i] * a[k - i] with i < k - i are summed once
 * and doubled, and for even k the square a[k/2] * a[k/2] is added. The low
 * 28 bits of each column are stored and the rest is carried into the next
 * column. t0 and t1 alternate as the accumulator of the current and the
 * next column.
 *
 * The low 14 result words are collected in a local buffer and copied to r
 * only at the end; the high 14 words are written to r directly.
 * NOTE(review): presumably this lets r share storage with a - confirm
 * against callers.
 *
 * r  A single precision integer (28 words written).
 * a  A single precision integer (14 words read).
 */
SP_NOINLINE static void sp_3072_sqr_14(sp_digit* r, const sp_digit* a)
{
    sp_uint64 t0;
    sp_uint64 t1;
    /* Holds result words 0..13 until all input words have been read. */
    sp_digit t[14];

    /* Columns 0..13. */
    t0 =  ((sp_uint64)a[ 0]) * a[ 0];
    t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2;
    t[ 0] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[ 2]) * 2
       +  ((sp_uint64)a[ 1]) * a[ 1];
    t[ 1] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[ 3]
       +  ((sp_uint64)a[ 1]) * a[ 2]) * 2;
    t[ 2] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[ 4]
       +  ((sp_uint64)a[ 1]) * a[ 3]) * 2
       +  ((sp_uint64)a[ 2]) * a[ 2];
    t[ 3] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[ 5]
       +  ((sp_uint64)a[ 1]) * a[ 4]
       +  ((sp_uint64)a[ 2]) * a[ 3]) * 2;
    t[ 4] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[ 6]
       +  ((sp_uint64)a[ 1]) * a[ 5]
       +  ((sp_uint64)a[ 2]) * a[ 4]) * 2
       +  ((sp_uint64)a[ 3]) * a[ 3];
    t[ 5] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[ 7]
       +  ((sp_uint64)a[ 1]) * a[ 6]
       +  ((sp_uint64)a[ 2]) * a[ 5]
       +  ((sp_uint64)a[ 3]) * a[ 4]) * 2;
    t[ 6] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[ 8]
       +  ((sp_uint64)a[ 1]) * a[ 7]
       +  ((sp_uint64)a[ 2]) * a[ 6]
       +  ((sp_uint64)a[ 3]) * a[ 5]) * 2
       +  ((sp_uint64)a[ 4]) * a[ 4];
    t[ 7] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[ 9]
       +  ((sp_uint64)a[ 1]) * a[ 8]
       +  ((sp_uint64)a[ 2]) * a[ 7]
       +  ((sp_uint64)a[ 3]) * a[ 6]
       +  ((sp_uint64)a[ 4]) * a[ 5]) * 2;
    t[ 8] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[10]
       +  ((sp_uint64)a[ 1]) * a[ 9]
       +  ((sp_uint64)a[ 2]) * a[ 8]
       +  ((sp_uint64)a[ 3]) * a[ 7]
       +  ((sp_uint64)a[ 4]) * a[ 6]) * 2
       +  ((sp_uint64)a[ 5]) * a[ 5];
    t[ 9] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[11]
       +  ((sp_uint64)a[ 1]) * a[10]
       +  ((sp_uint64)a[ 2]) * a[ 9]
       +  ((sp_uint64)a[ 3]) * a[ 8]
       +  ((sp_uint64)a[ 4]) * a[ 7]
       +  ((sp_uint64)a[ 5]) * a[ 6]) * 2;
    t[10] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 0]) * a[12]
       +  ((sp_uint64)a[ 1]) * a[11]
       +  ((sp_uint64)a[ 2]) * a[10]
       +  ((sp_uint64)a[ 3]) * a[ 9]
       +  ((sp_uint64)a[ 4]) * a[ 8]
       +  ((sp_uint64)a[ 5]) * a[ 7]) * 2
       +  ((sp_uint64)a[ 6]) * a[ 6];
    t[11] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 0]) * a[13]
       +  ((sp_uint64)a[ 1]) * a[12]
       +  ((sp_uint64)a[ 2]) * a[11]
       +  ((sp_uint64)a[ 3]) * a[10]
       +  ((sp_uint64)a[ 4]) * a[ 9]
       +  ((sp_uint64)a[ 5]) * a[ 8]
       +  ((sp_uint64)a[ 6]) * a[ 7]) * 2;
    t[12] = t0 & 0xfffffff; t1 += t0 >> 28;
    /* Columns 14..26. */
    t0 = (((sp_uint64)a[ 1]) * a[13]
       +  ((sp_uint64)a[ 2]) * a[12]
       +  ((sp_uint64)a[ 3]) * a[11]
       +  ((sp_uint64)a[ 4]) * a[10]
       +  ((sp_uint64)a[ 5]) * a[ 9]
       +  ((sp_uint64)a[ 6]) * a[ 8]) * 2
       +  ((sp_uint64)a[ 7]) * a[ 7];
    t[13] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 2]) * a[13]
       +  ((sp_uint64)a[ 3]) * a[12]
       +  ((sp_uint64)a[ 4]) * a[11]
       +  ((sp_uint64)a[ 5]) * a[10]
       +  ((sp_uint64)a[ 6]) * a[ 9]
       +  ((sp_uint64)a[ 7]) * a[ 8]) * 2;
    r[14] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 3]) * a[13]
       +  ((sp_uint64)a[ 4]) * a[12]
       +  ((sp_uint64)a[ 5]) * a[11]
       +  ((sp_uint64)a[ 6]) * a[10]
       +  ((sp_uint64)a[ 7]) * a[ 9]) * 2
       +  ((sp_uint64)a[ 8]) * a[ 8];
    r[15] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 4]) * a[13]
       +  ((sp_uint64)a[ 5]) * a[12]
       +  ((sp_uint64)a[ 6]) * a[11]
       +  ((sp_uint64)a[ 7]) * a[10]
       +  ((sp_uint64)a[ 8]) * a[ 9]) * 2;
    r[16] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 5]) * a[13]
       +  ((sp_uint64)a[ 6]) * a[12]
       +  ((sp_uint64)a[ 7]) * a[11]
       +  ((sp_uint64)a[ 8]) * a[10]) * 2
       +  ((sp_uint64)a[ 9]) * a[ 9];
    r[17] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 6]) * a[13]
       +  ((sp_uint64)a[ 7]) * a[12]
       +  ((sp_uint64)a[ 8]) * a[11]
       +  ((sp_uint64)a[ 9]) * a[10]) * 2;
    r[18] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 7]) * a[13]
       +  ((sp_uint64)a[ 8]) * a[12]
       +  ((sp_uint64)a[ 9]) * a[11]) * 2
       +  ((sp_uint64)a[10]) * a[10];
    r[19] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[ 8]) * a[13]
       +  ((sp_uint64)a[ 9]) * a[12]
       +  ((sp_uint64)a[10]) * a[11]) * 2;
    r[20] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[ 9]) * a[13]
       +  ((sp_uint64)a[10]) * a[12]) * 2
       +  ((sp_uint64)a[11]) * a[11];
    r[21] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[10]) * a[13]
       +  ((sp_uint64)a[11]) * a[12]) * 2;
    r[22] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 = (((sp_uint64)a[11]) * a[13]) * 2
       +  ((sp_uint64)a[12]) * a[12];
    r[23] = t1 & 0xfffffff; t0 += t1 >> 28;
    t1 = (((sp_uint64)a[12]) * a[13]) * 2;
    r[24] = t0 & 0xfffffff; t1 += t0 >> 28;
    t0 =  ((sp_uint64)a[13]) * a[13];
    r[25] = t1 & 0xfffffff; t0 += t1 >> 28;
    r[26] = t0 & 0xfffffff;
    /* Whatever is left above 28 bits becomes the top word, unmasked. */
    r[27] = (sp_digit)(t0 >> 28);
    /* Now that a is no longer needed, store the low half. */
    XMEMCPY(r, t, sizeof(t));
}

/* Square a 28 word number using three 14 word squarings. (r = a * a)
 *
 * With a = aH:aL split into 14 word halves, the result is built from
 * aL^2, aH^2 and (aL+aH)^2 - aL^2 - aH^2.
 *
 * r  A single precision integer (56 words written).
 * a  A single precision integer (28 words read).
 */
SP_NOINLINE static void sp_3072_sqr_28(sp_digit* r, const sp_digit* a)
{
    /* First holds aL + aH, then the middle term computed from it. */
    sp_digit mid[28];

    (void)sp_3072_add_14(mid, a, a + 14);
    sp_3072_norm_14(mid);

    /* aH^2 goes to the upper 28 words of r, aL^2 to the lower 28. */
    sp_3072_sqr_14(r + 28, a + 14);
    sp_3072_sqr_14(r, a);
    sp_3072_sqr_14(mid, mid);

    /* mid = (aL + aH)^2 - aH^2 - aL^2 */
    (void)sp_3072_sub_28(mid, mid, r + 28);
    (void)sp_3072_sub_28(mid, mid, r);

    /* Add the middle term in at word 14 and tidy up the carries. */
    (void)sp_3072_add_28(r + 14, r + 14, mid);
    sp_3072_norm_56(r);
}

/* Square a 56 word number using three 28 word squarings. (r = a * a)
 *
 * With a = aH:aL split into 28 word halves, the result is built from
 * aL^2, aH^2 and (aL+aH)^2 - aL^2 - aH^2.
 *
 * r  A single precision integer (112 words written).
 * a  A single precision integer (56 words read).
 */
SP_NOINLINE static void sp_3072_sqr_56(sp_digit* r, const sp_digit* a)
{
    /* First holds aL + aH, then the middle term computed from it. */
    sp_digit mid[56];

    (void)sp_3072_add_28(mid, a, a + 28);
    sp_3072_norm_28(mid);

    /* aH^2 goes to the upper 56 words of r, aL^2 to the lower 56. */
    sp_3072_sqr_28(r + 56, a + 28);
    sp_3072_sqr_28(r, a);
    sp_3072_sqr_28(mid, mid);

    /* mid = (aL + aH)^2 - aH^2 - aL^2 */
    (void)sp_3072_sub_56(mid, mid, r + 56);
    (void)sp_3072_sub_56(mid, mid, r);

    /* Add the middle term in at word 28 and tidy up the carries. */
    (void)sp_3072_add_56(r + 28, r + 28, mid);
    sp_3072_norm_112(r);
}

/* Square a 112 word number using three 56 word squarings. (r = a * a)
 *
 * With a = aH:aL split into 56 word halves, the result is built from
 * aL^2, aH^2 and (aL+aH)^2 - aL^2 - aH^2.
 *
 * r  A single precision integer (224 words written).
 * a  A single precision integer (112 words read).
 */
SP_NOINLINE static void sp_3072_sqr_112(sp_digit* r, const sp_digit* a)
{
    /* First holds aL + aH, then the middle term computed from it. */
    sp_digit mid[112];

    (void)sp_3072_add_56(mid, a, a + 56);
    sp_3072_norm_56(mid);

    /* aH^2 goes to the upper 112 words of r, aL^2 to the lower 112. */
    sp_3072_sqr_56(r + 112, a + 56);
    sp_3072_sqr_56(r, a);
    sp_3072_sqr_56(mid, mid);

    /* mid = (aL + aH)^2 - aH^2 - aL^2 */
    (void)sp_3072_sub_112(mid, mid, r + 112);
    (void)sp_3072_sub_112(mid, mid, r);

    /* Add the middle term in at word 56 and tidy up the carries. */
    (void)sp_3072_add_112(r + 56, r + 56, mid);
    sp_3072_norm_224(r);
}

#endif /* !WOLFSSL_SP_SMALL */
/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest word of a is used. The inverse is found by Newton
 * iteration, doubling the number of correct low bits at each step, and the
 * result is reduced to 28 bits.
 *
 * The iteration is carried out in an unsigned type: the intermediate
 * products exceed the range of a signed 32-bit word, which would be
 * undefined behaviour, whereas unsigned arithmetic wraps. unsigned long is
 * at least 32 bits wide and only the low 28 bits of the result are kept.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(const sp_digit* a, sp_digit* rho)
{
    unsigned long x;
    unsigned long b;

    b = (unsigned long)a[0];
    x = (((b + 2UL) & 4UL) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2UL - b * x;                 /* here x*a==1 mod 2**8 */
    x *= 2UL - b * x;                 /* here x*a==1 mod 2**16 */
    x *= 2UL - b * x;                 /* here x*a==1 mod 2**32 */
    x &= 0xfffffffUL;

    /* rho = -1/m mod b */
    *rho = ((sp_digit)1 << 28) - (sp_digit)x;
}

/* Multiply a 112 word number by a single word. (r = a * b)
 *
 * Each product is added to a running 64-bit carry; the low 28 bits become
 * the result word and the remainder carries into the next word. The final
 * carry, masked to 28 bits, is stored in r[112].
 *
 * r  A single precision integer (113 words written).
 * a  A single precision integer (112 words read).
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_d_112(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    sp_int64 prod[4];
    int base;
    int k;

    for (base = 0; base < 112; base += 4) {
        /* Form the four products of this group before storing any of the
         * group's result words. */
        for (k = 0; k < 4; k++) {
            prod[k] = scalar * a[base + k];
        }
        for (k = 0; k < 4; k++) {
            acc += prod[k];
            r[base + k] = (sp_digit)(acc & 0xfffffff);
            acc >>= 28;
        }
    }
    r[112] = (sp_digit)(acc & 0xfffffff);
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* r = 2^n mod m where n = 1536 is the number of bits to reduce by.
 * Computed as (2^1536 - 1) - m + 1, i.e. with a single subtraction of m.
 *
 * r  A single precision number; 56 digits are written.
 * m  A single precision number (the modulus).
 */
static void sp_3072_mont_norm_56(sp_digit* r, const sp_digit* m)
{
    int i;

    /* r = 2^1536 - 1: 54 full 28-bit digits, 24 bits in digit 54 and an
     * empty digit 55 (54 * 28 + 24 = 1536). */
    for (i = 0; i < 54; i++) {
        r[i] = 0xfffffff;
    }
    r[54] = 0xffffffL;
    r[55] = 0;

    /* r = (2^n - 1) - m */
    (void)sp_3072_sub_56(r, r, m);

    /* Add one so r = 2^n - m */
    r[0] += 1;
}

/* Compare two 56-digit numbers without branching on the data.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_56(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int i;

    /* Scan from the most significant digit down. Every digit is visited;
     * each digit difference is masked with a value derived from the
     * differences accumulated so far before being OR-ed in. */
    for (i = 55; i >= 0; i--) {
        diff |= (a[i] - b[i]) & ~(((sp_digit)0 - diff) >> 27);
    }

    return diff;
}

/* Subtract b from a digit by digit when the mask is set. (r = a - (b & m))
 * m is -1 to subtract and 0 when not. No carry is propagated between digits.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_56(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int i;

    for (i = 0; i < 56; i++) {
        r[i] = a[i] - (b[i] & m);
    }
}

/* Multiply a 56-digit number by a digit and accumulate. (r += a * b)
 *
 * r  A single precision integer. r[0] to r[55] are rewritten as 28-bit
 *    digits and the final carry is added to r[56].
 * a  A single precision integer; 56 digits are read.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_add_56(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    const sp_int64 mult = b;
    sp_int64 acc = 0;
    int i;

    for (i = 0; i < 56; i++) {
        acc += r[i];
        acc += mult * a[i];
        r[i] = ((sp_digit)acc) & 0xfffffff;
        acc >>= 28;
    }
    r[56] += (sp_digit)acc;
#else
    const sp_int64 mult = b;
    sp_int64 carry = 0;
    sp_int64 s0;
    sp_int64 s1;
    sp_int64 s2;
    sp_int64 s3;
    sp_int64 s4;
    sp_int64 s5;
    sp_int64 s6;
    sp_int64 s7;
    int i;

    /* Seven groups of eight digits. Within a group every input digit is
     * read before any result digit is stored; the carry out of one group
     * seeds the first sum of the next. */
    for (i = 0; i < 56; i += 8) {
        s0  = carry;
        s0 += (mult * a[i + 0]) + r[i + 0];
        s1  = (mult * a[i + 1]) + r[i + 1];
        s2  = (mult * a[i + 2]) + r[i + 2];
        s3  = (mult * a[i + 3]) + r[i + 3];
        s4  = (mult * a[i + 4]) + r[i + 4];
        s5  = (mult * a[i + 5]) + r[i + 5];
        s6  = (mult * a[i + 6]) + r[i + 6];
        s7  = (mult * a[i + 7]) + r[i + 7];

        r[i + 0] = (sp_digit)(s0 & 0xfffffff);
        s1 += s0 >> 28;
        r[i + 1] = (sp_digit)(s1 & 0xfffffff);
        s2 += s1 >> 28;
        r[i + 2] = (sp_digit)(s2 & 0xfffffff);
        s3 += s2 >> 28;
        r[i + 3] = (sp_digit)(s3 & 0xfffffff);
        s4 += s3 >> 28;
        r[i + 4] = (sp_digit)(s4 & 0xfffffff);
        s5 += s4 >> 28;
        r[i + 5] = (sp_digit)(s5 & 0xfffffff);
        s6 += s5 >> 28;
        r[i + 6] = (sp_digit)(s6 & 0xfffffff);
        s7 += s6 >> 28;
        r[i + 7] = (sp_digit)(s7 & 0xfffffff);

        carry = s7 >> 28;
    }
    r[56] += (sp_digit)carry;
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the value held above bit 1536 of a down to the bottom of r, i.e.
 * shift right by 54 digits and 24 bits, then clear the digits above.
 *
 * r  A single precision number. r[0] to r[54] receive the shifted value and
 *    r[55] to r[109] are zeroed.
 * a  A single precision number; a[54] to a[109] are read. The caller in this
 *    file passes the same buffer for r and a.
 */
static void sp_3072_mont_shift_56(sp_digit* r, const sp_digit* a)
{
    sp_int64 window;
    int i;

    /* Bit 1536 is 24 bits into digit 54, so the top 4 bits of a[54] are the
     * lowest result bits and every later digit enters 4 bits higher. */
    window  = a[54] >> 24;
    window += ((sp_int64)a[55]) << 4;

    /* Each result digit is stored before the next source digit is read. */
    for (i = 0; i < 54; i++) {
        r[i] = (sp_digit)(window & 0xfffffff);
        window >>= 28;
        window += ((sp_int64)a[i + 56]) << 4;
    }
    r[54] = (sp_digit)window;

    XMEMSET(&r[55], 0, sizeof(*r) * 55U);
}

/* Montgomery-reduce a double-width number in place. The result occupies the
 * bottom 55 digits (1536 bits) of a; sp_3072_mont_shift_56() zeroes the
 * digits above.
 *
 * a   A single precision number to reduce in place. Digits up to at least
 *     a[110] are touched (sp_3072_mul_add_56(a+54, ...) updates a[110]).
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_56(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* sp_3072_norm_56 is defined elsewhere; presumably it propagates carries
     * so the upper digits are in range before products are added to them. */
    sp_3072_norm_56(a + 55);

    /* For each of the low 54 digits: mu is the low 28 bits of a[i] * mp,
     * mu * m is added in at digit offset i, and whatever a[i] holds above
     * bit 28 is carried into a[i+1]. */
    for (i=0; i<54; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffff;
        sp_3072_mul_add_56(a+i, m, mu);
        a[i+1] += a[i] >> 28;
    }
    /* Digit 54 only contributes 24 bits (1536 = 54 * 28 + 24), so mu is
     * limited to 24 bits for the last step. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xffffffL;
    sp_3072_mul_add_56(a+i, m, mu);
    a[i+1] += a[i] >> 28;
    a[i] &= 0xfffffff;
    /* Drop the low 1536 bits. */
    sp_3072_mont_shift_56(a, a);
    /* Subtract m once when the top digit of the result exceeds the top digit
     * of m: the mask is all ones when over >= 1 and zero otherwise. */
    over = a[54] - m[54];
    sp_3072_cond_sub_56(a, a, m, ~((over - 1) >> 31));
    sp_3072_norm_56(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The full product is written to r by sp_3072_mul_56() and then reduced in
 * place by sp_3072_mont_reduce_56(), so r must have room for the
 * double-width product.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_mul_56(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_3072_mul_56(r, a, b);
    sp_3072_mont_reduce_56(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r by sp_3072_sqr_56() and then reduced in
 * place by sp_3072_mont_reduce_56(), so r must have room for the
 * double-width square. Callers in this file pass the same buffer for r
 * and a.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_sqr_56(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_3072_sqr_56(r, a);
    sp_3072_mont_reduce_56(r, m, mp);
}

/* Multiply a 56-digit number by a single digit. (r = a * b)
 *
 * r  Result. 57 digits are written: r[0] to r[56].
 * a  A single precision integer; 56 digits are read.
 * b  A scalar (one digit).
 */
SP_NOINLINE static void sp_3072_mul_d_56(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    const sp_int64 mult = b;
    sp_int64 acc = 0;
    int i;

    for (i = 0; i < 56; i += 4) {
        /* All four products of the group are formed before any digit of
         * the group is stored. */
        const sp_int64 prod0 = mult * a[i + 0];
        const sp_int64 prod1 = mult * a[i + 1];
        const sp_int64 prod2 = mult * a[i + 2];
        const sp_int64 prod3 = mult * a[i + 3];

        acc += prod0;
        r[i + 0] = (sp_digit)(acc & 0xfffffff);
        acc >>= 28;

        acc += prod1;
        r[i + 1] = (sp_digit)(acc & 0xfffffff);
        acc >>= 28;

        acc += prod2;
        r[i + 2] = (sp_digit)(acc & 0xfffffff);
        acc >>= 28;

        acc += prod3;
        r[i + 3] = (sp_digit)(acc & 0xfffffff);
        acc >>= 28;
    }
    /* Low 28 bits of the final carry become the extra top digit. */
    r[56] = (sp_digit)(acc & 0xfffffff);
}

#ifndef WOLFSSL_SP_SMALL
/* Add b to a digit by digit when the mask is set. (r = a + (b & m))
 * m is -1 to add and 0 when not. No carry is propagated between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_56(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int i;

    for (i = 0; i < 56; i++) {
        r[i] = a[i] + (b[i] & m);
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift a 56-digit number right by n bits. (r = a >> n)
 *
 * r  Result; 56 digits are written. May be the same buffer as a.
 * a  Number to shift; 56 digits are read.
 * n  Number of bits to shift by. Presumably expected to be less than the
 *    28-bit digit width - the caller in this file passes 4.
 */
SP_NOINLINE static void sp_3072_rshift_56(sp_digit* r, const sp_digit* a,
        byte n)
{
    const int up = 28 - n;
    int i;

    /* Each digit keeps its own high bits and takes the low n bits of the
     * digit above it; the source digit above is read before it is stored. */
    for (i = 0; i < 55; i++) {
        r[i] = (a[i] >> n) | ((a[i + 1] << up) & 0xfffffff);
    }
    /* Nothing above the top digit to pull bits from. */
    r[55] = a[55] >> n;
}

/* Divide the two-digit value (d1 * 2^28 + d0) by a single digit.
 *
 * One of four implementations is selected at compile time; see the comments
 * on each branch.
 *
 * d1   High digit of the dividend.
 * d0   Low digit of the dividend.
 * div  Divisor.
 * returns the quotient digit computed by the selected implementation.
 */
static WC_INLINE sp_digit sp_3072_div_word_56(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    /* Plain 64-bit by 32-bit C division. */
    sp_int64 d = ((sp_int64)d1 << 28) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    /* Signed hardware divide: the 64-bit dividend is split into two 32-bit
     * halves bound to the "a" (low, also the output) and "d" (high)
     * register constraints. */
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo)
        : "d" (hi), "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    /* Division-free path: the quotient is built a bit at a time with
     * shifts, compares and masked subtracts against dv = div/2 + 1 (no
     * data-dependent branches are visible here), then corrected against
     * the real divisor. */
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 28);
    sp_digit t0 = (sp_digit)(d & 0xfffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* First quotient bit: 1 when (dv - t1) is negative, and then dv is
     * taken off t1. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 26; i >= 1; i--) {
        /* Shift the next bit of t0 into t1, then compare-and-subtract. */
        t1 += t1 + (((sp_uint32)t0 >> 27) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correction steps: m is the remainder left by the current estimate. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 28);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 56) - (sp_digit)(d >> 56);

    /* Two final adjustments of at most one each: sign is +1 or -1 according
     * to bit 31 of m, and r moves by sign when div is less than |m|. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    /* Three-step estimate that only uses 32-bit divisions: each step divides
     * the top part of what is left of d and removes t * div from d. */
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 13) + 1;

    t = (sp_digit)(d >> 26);
    t = (t / dv) << 13;
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)(d >> 11);
    t = t / (dv << 2);
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit by a single digit.
 *
 * On x86, x86_64, AArch64 or when SP_DIV_WORD_USE_DIV is defined this is an
 * ordinary integer division. Otherwise no division is performed: the result
 * is 1 when (div - d) has bit 31 set (i.e. is negative as a 32-bit value)
 * and 0 if not.
 *
 * d    Dividend.
 * div  Divisor.
 * returns the quotient as described above.
 */
static WC_INLINE sp_digit sp_3072_word_div_word_56(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide a by d and put the remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * a  Number to be divided; 112 digits are read.
 * d  Number to divide with; 56 digits are read.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 112 digits are copied into r before the
 *    final fix-up, so r must have room for 112 digits.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_56(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
    /* One scratch area holding, in order: t1 (scaled dividend, 113 digits),
     * t2 (trial product, 57 digits) and sd (scaled divisor). */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 56 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 56 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        t2 = t1 + 112 + 1;
        sd = t2 + 56 + 1;

        /* Scale divisor and dividend by 2^4. Presumably this fills the
         * 24-bit top digit of a 1536-bit divisor so that sd[54] can serve
         * as the trial divisor; the scaling is undone by the final right
         * shift of 4. */
        sp_3072_mul_d_56(sd, d, (sp_digit)1 << 4);
        sp_3072_mul_d_112(t1, a, (sp_digit)1 << 4);
        dv = sd[54];
        t1[55 + 55] += t1[55 + 55 - 1] >> 28;
        t1[55 + 55 - 1] &= 0xfffffff;
        for (i=55; i>=0; i--) {
            /* Estimate a quotient digit from the top two digits of the
             * current window and subtract that multiple of sd. */
            r1 = sp_3072_div_word_56(t1[55 + i], t1[55 + i - 1], dv);

            sp_3072_mul_d_56(t2, sd, r1);
            (void)sp_3072_sub_56(&t1[i], &t1[i], t2);
            sp_3072_norm_55(&t1[i]);
            t1[55 + i] += t1[55 + i - 1] >> 28;
            t1[55 + i - 1] &= 0xfffffff;
            /* Second pass on the negated top digits: a multiple of sd is
             * added back, correcting an estimate that was too large. */
            r1 = sp_3072_div_word_56(-t1[55 + i], -t1[55 + i - 1], dv);
            r1 -= t1[55 + i];
            sp_3072_mul_d_56(t2, sd, r1);
            (void)sp_3072_add_56(&t1[i], &t1[i], t2);
            t1[55 + i] += t1[55 + i - 1] >> 28;
            t1[55 + i - 1] &= 0xfffffff;
        }
        /* One last single-digit quotient step on what remains. */
        t1[55 - 1] += t1[55 - 2] >> 28;
        t1[55 - 2] &= 0xfffffff;
        r1 = sp_3072_word_div_word_56(t1[55 - 1], dv);

        sp_3072_mul_d_56(t2, sd, r1);
        sp_3072_sub_56(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 112U);
        /* Propagate carries through the low 55 digits. */
        for (i=0; i<54; i++) {
            r[i+1] += r[i] >> 28;
            r[i] &= 0xfffffff;
        }
        /* If the remainder went negative (sign bit of r[54] set), add the
         * scaled divisor back once. */
        sp_3072_cond_add_56(r, r, sd, r[54] >> 31);

        sp_3072_norm_55(r);
        /* Undo the 2^4 scaling. */
        sp_3072_rshift_56(r, r, 4);
        r[55] = 0;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_56() that discards the quotient. That
 * function reads 112 digits of a and copies 112 digits into r.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_56(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    return sp_3072_div_56(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is compiled:
 *  - WOLFSSL_SP_SMALL without WOLFSSL_SP_FAST_MODEXP: bit-at-a-time ladder;
 *  - otherwise, unless WC_NO_CACHE_RESISTANT is defined: the same ladder
 *    with a slightly different set-up;
 *  - otherwise: 5-bit fixed window using a table of 32 powers.
 *
 * r        A single precision number that is the result of the operation.
 *          112 digits are copied into r.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before use.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_56(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 112];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 56 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Three double-width temporaries; norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 56 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 56U * 2U);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_56(norm, m);

        if (reduceA != 0) {
            err = sp_3072_mod_56(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 56U);
        }
    }
    if (err == MP_OKAY) {
        /* Convert the base to Montgomery form: t[1] = a * norm mod m. */
        sp_3072_mul_56(t[1], t[1], norm);
        err = sp_3072_mod_56(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Walk the exponent from its top bit; n holds the current digit
         * with the next bit to use at bit 27, c counts the bits left in n. */
        i = bits / 28;
        c = bits % 28;
        n = e[i--] << (28 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 28;
            }

            y = (int)((n >> 27) & 1);
            n <<= 1;

            /* Ladder step: the product goes to t[y^1], then t[y] is squared
             * by way of t[2]. The buffer to square is picked by masking
             * both addresses with addr_mask[] (defined elsewhere,
             * presumably all-zero / all-one masks) rather than by
             * indexing t[] with y. */
            sp_3072_mont_mul_56(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 56 * 2);
            sp_3072_mont_sqr_56(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 56 * 2);
        }

        /* Leave Montgomery form and make sure the result is below m. */
        sp_3072_mont_reduce_56(t[0], m, mp);
        n = sp_3072_cmp_56(t[0], m);
        sp_3072_cond_sub_56(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 56 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 112];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 56 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Three double-width temporaries; norm shares storage with t[0].
         * Unlike the small variant the buffers are not zeroed first. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 56 * 2);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_56(norm, m);

        /* t[1] = base in Montgomery form (a * norm mod m). */
        if (reduceA != 0) {
            err = sp_3072_mod_56(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_56(t[1], t[1], norm);
                err = sp_3072_mod_56(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_56(t[1], a, norm);
            err = sp_3072_mod_56(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Same bit-at-a-time ladder as the small variant above. */
        i = bits / 28;
        c = bits % 28;
        n = e[i--] << (28 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 28;
            }

            y = (int)((n >> 27) & 1);
            n <<= 1;

            sp_3072_mont_mul_56(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 56 * 2);
            sp_3072_mont_sqr_56(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 56 * 2);
        }

        /* Leave Montgomery form and make sure the result is below m. */
        sp_3072_mont_reduce_56(t[0], m, mp);
        n = sp_3072_cmp_56(t[0], m);
        sp_3072_cond_sub_56(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 56 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(32 * 112) + 112];
#endif
    sp_digit* t[32];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 112) + 112), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* 32 table entries of 112 digits each, followed by the running
         * result rt (3584 = 32 * 112). norm shares storage with t[0], so
         * t[0] holds the value written by sp_3072_mont_norm_56(). */
        norm = td;
        for (i=0; i<32; i++)
            t[i] = td + i * 112;
        rt = td + 3584;

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_56(norm, m);

        /* t[1] = base in Montgomery form (a * norm mod m). */
        if (reduceA != 0) {
            err = sp_3072_mod_56(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_56(t[1], t[1], norm);
                err = sp_3072_mod_56(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_56(t[1], a, norm);
            err = sp_3072_mod_56(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table so that t[k] is the k-th power of the base: even
         * entries by squaring t[k/2], odd entries by multiplying two
         * neighbouring lower entries. */
        sp_3072_mont_sqr_56(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_56(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_56(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_56(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_56(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_56(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_56(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_56(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_56(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_56(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_56(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_56(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_56(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_56(t[15], t[ 8], t[ 7], m, mp);
        sp_3072_mont_sqr_56(t[16], t[ 8], m, mp);
        sp_3072_mont_mul_56(t[17], t[ 9], t[ 8], m, mp);
        sp_3072_mont_sqr_56(t[18], t[ 9], m, mp);
        sp_3072_mont_mul_56(t[19], t[10], t[ 9], m, mp);
        sp_3072_mont_sqr_56(t[20], t[10], m, mp);
        sp_3072_mont_mul_56(t[21], t[11], t[10], m, mp);
        sp_3072_mont_sqr_56(t[22], t[11], m, mp);
        sp_3072_mont_mul_56(t[23], t[12], t[11], m, mp);
        sp_3072_mont_sqr_56(t[24], t[12], m, mp);
        sp_3072_mont_mul_56(t[25], t[13], t[12], m, mp);
        sp_3072_mont_sqr_56(t[26], t[13], m, mp);
        sp_3072_mont_mul_56(t[27], t[14], t[13], m, mp);
        sp_3072_mont_sqr_56(t[28], t[14], m, mp);
        sp_3072_mont_mul_56(t[29], t[15], t[14], m, mp);
        sp_3072_mont_sqr_56(t[30], t[15], m, mp);
        sp_3072_mont_mul_56(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows, then
         * load the top exponent digit left-aligned in the 32-bit n. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 27) / 28) - 1;
        c = bits % 28;
        if (c == 0) {
            c = 28;
        }
        /* When the rounded-up count reaches index 56 or beyond, nothing is
         * read from e for that digit and zero bits are used instead. */
        if (i < 56) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* Fewer than 5 bits available: pull in the next digit as well. */
        if (c < 5) {
            n |= e[i--] << (4 - c);
            c += 28;
        }
        /* The first window just selects the starting table entry. */
        y = (int)((n >> 27) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 112);
        while ((i >= 0) || (c >= 5)) {
            if (c >= 5) {
                /* Whole window available in n. */
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* n is exhausted: load the next digit and take 5 bits. */
                n = e[i--] << 4;
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c = 23;
            }
            else {
                /* Window straddles two digits: take what is left of n and
                 * fill the remaining low bits from the next digit. */
                y = (byte)((n >> 27) & 0x1f);
                n = e[i--] << 4;
                c = 5 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 28 - c;
            }

            /* Five squarings per window, then multiply by the table entry
             * selected by the window value. */
            sp_3072_mont_sqr_56(rt, rt, m, mp);
            sp_3072_mont_sqr_56(rt, rt, m, mp);
            sp_3072_mont_sqr_56(rt, rt, m, mp);
            sp_3072_mont_sqr_56(rt, rt, m, mp);
            sp_3072_mont_sqr_56(rt, rt, m, mp);

            sp_3072_mont_mul_56(rt, rt, t[y], m, mp);
        }

        /* Leave Montgomery form and make sure the result is below m. */
        sp_3072_mont_reduce_56(rt, m, mp);
        n = sp_3072_cmp_56(rt, m);
        sp_3072_cond_sub_56(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 112);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) | WOLFSSL_HAVE_SP_DH */

/* r = 2^n mod m where n = 3072 is the number of bits to reduce by.
 * Given m must be 3072 bits, just need to subtract: the result is computed
 * as (2^3072 - 1) - m + 1.
 *
 * r  A single precision number; 112 digits are written.
 * m  A single precision number (the modulus).
 */
static void sp_3072_mont_norm_112(sp_digit* r, const sp_digit* m)
{
    int i;

    /* r = 2^3072 - 1: 109 full 28-bit digits, 20 bits in digit 109 and two
     * empty digits above (109 * 28 + 20 = 3072). */
    for (i = 0; i < 109; i++) {
        r[i] = 0xfffffff;
    }
    r[109] = 0xfffffL;
    r[110] = 0;
    r[111] = 0;

    /* r = (2^n - 1) - m */
    (void)sp_3072_sub_112(r, r, m);

    /* Add one so r = 2^n - m */
    r[0] += 1;
}

/* Compare two 112-digit numbers without branching on the data.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_3072_cmp_112(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int i;

    /* Scan from the most significant digit down. Every digit is visited;
     * each digit difference is masked with a value derived from the
     * differences accumulated so far before being OR-ed in. */
    for (i = 111; i >= 0; i--) {
        diff |= (a[i] - b[i]) & ~(((sp_digit)0 - diff) >> 27);
    }

    return diff;
}

/* Subtract b from a digit by digit when the mask is set. (r = a - (b & m))
 * m is -1 to subtract and 0 when not. No carry is propagated between digits.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_3072_cond_sub_112(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int i;

    for (i = 0; i < 112; i++) {
        r[i] = a[i] - (b[i] & m);
    }
}

/* Multiply a 112-digit number by a digit and accumulate. (r += a * b)
 *
 * r  A single precision integer. r[0] to r[111] are rewritten as 28-bit
 *    digits and the final carry is added to r[112].
 * a  A single precision integer; 112 digits are read.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_add_112(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    const sp_int64 mult = b;
    sp_int64 acc = 0;
    int i;

    for (i = 0; i < 112; i++) {
        acc += r[i];
        acc += mult * a[i];
        r[i] = ((sp_digit)acc) & 0xfffffff;
        acc >>= 28;
    }
    r[112] += (sp_digit)acc;
#else
    const sp_int64 mult = b;
    sp_int64 carry = 0;
    sp_int64 s0;
    sp_int64 s1;
    sp_int64 s2;
    sp_int64 s3;
    sp_int64 s4;
    sp_int64 s5;
    sp_int64 s6;
    sp_int64 s7;
    int i;

    /* Fourteen groups of eight digits. Within a group every input digit is
     * read before any result digit is stored; the carry out of one group
     * seeds the first sum of the next. */
    for (i = 0; i < 112; i += 8) {
        s0  = carry;
        s0 += (mult * a[i + 0]) + r[i + 0];
        s1  = (mult * a[i + 1]) + r[i + 1];
        s2  = (mult * a[i + 2]) + r[i + 2];
        s3  = (mult * a[i + 3]) + r[i + 3];
        s4  = (mult * a[i + 4]) + r[i + 4];
        s5  = (mult * a[i + 5]) + r[i + 5];
        s6  = (mult * a[i + 6]) + r[i + 6];
        s7  = (mult * a[i + 7]) + r[i + 7];

        r[i + 0] = (sp_digit)(s0 & 0xfffffff);
        s1 += s0 >> 28;
        r[i + 1] = (sp_digit)(s1 & 0xfffffff);
        s2 += s1 >> 28;
        r[i + 2] = (sp_digit)(s2 & 0xfffffff);
        s3 += s2 >> 28;
        r[i + 3] = (sp_digit)(s3 & 0xfffffff);
        s4 += s3 >> 28;
        r[i + 4] = (sp_digit)(s4 & 0xfffffff);
        s5 += s4 >> 28;
        r[i + 5] = (sp_digit)(s5 & 0xfffffff);
        s6 += s5 >> 28;
        r[i + 6] = (sp_digit)(s6 & 0xfffffff);
        s7 += s6 >> 28;
        r[i + 7] = (sp_digit)(s7 & 0xfffffff);

        carry = s7 >> 28;
    }
    r[112] += (sp_digit)carry;
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the value held above bit 3072 of a down to the bottom of r, i.e.
 * shift right by 109 digits and 20 bits, then clear the digits above.
 *
 * r  A single precision number. r[0] to r[109] receive the shifted value and
 *    r[110] to r[219] are zeroed.
 * a  A single precision number; a[109] to a[219] are read. The caller in
 *    this file passes the same buffer for r and a.
 */
static void sp_3072_mont_shift_112(sp_digit* r, const sp_digit* a)
{
    sp_int64 window;
    int i;

    /* Bit 3072 is 20 bits into digit 109, so the top 8 bits of a[109] are
     * the lowest result bits and every later digit enters 8 bits higher. */
    window  = a[109] >> 20;
    window += ((sp_int64)a[110]) << 8;

    /* Each result digit is stored before the next source digit is read. */
    for (i = 0; i < 109; i++) {
        r[i] = (sp_digit)(window & 0xfffffff);
        window >>= 28;
        window += ((sp_int64)a[i + 111]) << 8;
    }
    r[109] = (sp_digit)window;

    XMEMSET(&r[110], 0, sizeof(*r) * 110U);
}

/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * The reduction is done in place; the result occupies the bottom 110 digits
 * of a and sp_3072_mont_shift_112() zeroes the digits above.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_3072_mont_reduce_112(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* sp_3072_norm_112 is defined elsewhere; presumably it propagates
     * carries so the upper digits are in range before products are added. */
    sp_3072_norm_112(a + 110);

#ifdef WOLFSSL_SP_DH
    if (mp != 1) {
        /* General case: mu is the low 28 bits of a[i] * mp; mu * m is added
         * in at digit offset i and the excess of a[i] is carried upwards. */
        for (i=0; i<109; i++) {
            mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffff;
            sp_3072_mul_add_112(a+i, m, mu);
            a[i+1] += a[i] >> 28;
        }
        /* Digit 109 only contributes 20 bits (3072 = 109 * 28 + 20). */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffL;
        sp_3072_mul_add_112(a+i, m, mu);
        a[i+1] += a[i] >> 28;
        a[i] &= 0xfffffff;
    }
    else {
        /* mp == 1: the multiplication by mp is skipped, mu is just the low
         * bits of a[i]. */
        for (i=0; i<109; i++) {
            mu = a[i] & 0xfffffff;
            sp_3072_mul_add_112(a+i, m, mu);
            a[i+1] += a[i] >> 28;
        }
        mu = a[i] & 0xfffffL;
        sp_3072_mul_add_112(a+i, m, mu);
        a[i+1] += a[i] >> 28;
        a[i] &= 0xfffffff;
    }
#else
    /* mu is the low 28 bits of a[i] * mp; mu * m is added in at digit
     * offset i and the excess of a[i] is carried upwards. */
    for (i=0; i<109; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffff;
        sp_3072_mul_add_112(a+i, m, mu);
        a[i+1] += a[i] >> 28;
    }
    /* Digit 109 only contributes 20 bits (3072 = 109 * 28 + 20). */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffL;
    sp_3072_mul_add_112(a+i, m, mu);
    a[i+1] += a[i] >> 28;
    a[i] &= 0xfffffff;
#endif
    /* Drop the low 3072 bits. */
    sp_3072_mont_shift_112(a, a);
    /* Subtract m once when the top digit of the result exceeds the top digit
     * of m: the mask is all ones when over >= 1 and zero otherwise. */
    over = a[109] - m[109];
    sp_3072_cond_sub_112(a, a, m, ~((over - 1) >> 31));
    sp_3072_norm_112(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is formed in r by sp_3072_mul_112 and then reduced in place by
 * sp_3072_mont_reduce_112, so r must have room for the double-width product
 * (the reduction touches words up to index 219).
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_mul_112(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Full product first, Montgomery reduction second. */
    sp_3072_mul_112(r, a, b);
    sp_3072_mont_reduce_112(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is formed in r by sp_3072_sqr_112 and then reduced in place by
 * sp_3072_mont_reduce_112, so r must have room for the double-width result
 * (the reduction touches words up to index 219).
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_3072_mont_sqr_112(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Full square first, Montgomery reduction second. */
    sp_3072_sqr_112(r, a);
    sp_3072_mont_reduce_112(r, m, mp);
}

/* Scale a 224 word number by a single word. (r = a * b)
 *
 * Each output word keeps 28 bits; the carry left after the last input word
 * is stored, masked to 28 bits, in r[224].
 *
 * r  A single precision integer. Receives 225 words.
 * a  A single precision integer of 224 words.
 * b  A scalar.
 */
SP_NOINLINE static void sp_3072_mul_d_224(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    const sp_int64 scalar = b;
    sp_int64 prod[4];
    sp_int64 acc = 0;
    int base;
    int k;

    for (base = 0; base < 224; base += 4) {
        /* Form the four partial products before storing any result word. */
        for (k = 0; k < 4; k++) {
            prod[k] = scalar * a[base + k];
        }
        /* Accumulate each product, emit 28 bits and keep the rest as carry. */
        for (k = 0; k < 4; k++) {
            acc += prod[k];
            r[base + k] = (sp_digit)(acc & 0xfffffff);
            acc >>= 28;
        }
    }
    r[224] = (sp_digit)(acc & 0xfffffff);
}

#ifndef WOLFSSL_SP_SMALL
/* Add b to a word by word, but only where the mask lets b through.
 * With m set to -1 every word of b is added; with m set to 0 the result is a
 * copy of a. No carries are propagated between words.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_3072_cond_add_112(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 112) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift a 112 word number right by n bits. (r = a >> n)
 *
 * Each result word takes its own bits shifted down plus the low n bits of
 * the next word moved to the top of the 28-bit digit. The last word has
 * nothing above it and is simply shifted.
 *
 * r  A single precision number receiving the shifted value.
 * a  A single precision number to shift. May be the same buffer as r.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_3072_rshift_112(sp_digit* r, const sp_digit* a,
        byte n)
{
    const int up = 28 - n;
    int w;

    for (w = 0; w < 111; w++) {
        r[w] = (a[w] >> n) | ((a[w + 1] << up) & 0xfffffff);
    }
    r[111] = a[111] >> n;
}

/* Divide the two word value (d1 << 28) + d0 by div and return the quotient
 * word used by the long division in this file.
 *
 * The implementation is selected at build time:
 *  - SP_USE_DIVTI3: a plain 64-bit division.
 *  - x86 / x86_64: the idiv instruction on the 64-bit value split into
 *    edx:eax.
 *  - not aarch64 and SP_DIV_WORD_USE_DIV not defined: a branch-free
 *    bit-at-a-time estimate followed by correction steps.
 *  - otherwise: three narrower divisions that refine the quotient.
 *
 * d1   Upper word of the dividend.
 * d0   Lower word of the dividend.
 * div  Divisor word.
 * returns the quotient word computed by the selected implementation.
 */
static WC_INLINE sp_digit sp_3072_div_word_112(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 28) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* idiv divides edx:eax by the operand, leaving the quotient in eax and
     * the remainder in edx. Both registers are therefore modified and must
     * be read-write operands: the compiler assumes an input-only operand
     * still holds its original value after the asm statement. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo), "+d" (hi)
        : "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 28);
    sp_digit t0 = (sp_digit)(d & 0xfffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Build the quotient one bit at a time against dv = (div >> 1) + 1.
     * Each comparison is taken from the sign bit of (dv - t1) and applied
     * through a mask, so no branch depends on the operand values. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 26; i >= 1; i--) {
        /* Shift the next bit of t0 into t1. */
        t1 += t1 + (((sp_uint32)t0 >> 27) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 28);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 56) - (sp_digit)(d >> 56);

    /* Two final adjustments of at most one each: sign is +1 or -1 taken from
     * bit 31 of m, and t2 is 1 when |m| exceeds div. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    sp_int64 d = ((sp_int64)d1 << 28) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 13) + 1;

    /* First pass: divide the top bits of d by the top bits of div
     * (rounded up), giving the upper part of the quotient. */
    t = (sp_digit)(d >> 26);
    t = (t / dv) << 13;
    r += t;
    d -= (sp_int64)t * div;
    /* Second pass on what remains, at a finer scale. */
    t = (sp_digit)(d >> 11);
    t = t / (dv << 2);
    r += t;
    d -= (sp_int64)t * div;
    /* Last pass: d is used as a single word and divided by div directly. */
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single word by a single word.
 *
 * On x86, x86_64, aarch64 or when SP_DIV_WORD_USE_DIV is defined this is a
 * real division. Otherwise no division is performed: the result is 1 when
 * d is greater than div and 0 when it is not, taken from the sign bit of
 * (div - d).
 *
 * d    Word to be divided.
 * div  Word to divide by.
 * returns the quotient, or the 0/1 comparison result described above.
 */
static WC_INLINE sp_digit sp_3072_word_div_word_112(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide a by d and put the remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are first multiplied by 1 << 8 and the remainder is shifted
 * right by 8 bits at the end. With a 3072-bit divisor this moves the top
 * 20-bit word (word 109) up to a full 28 bits - presumably so that the
 * quotient word estimates taken from sd[109] are accurate.
 *
 * a  Number to be divided. 224 words are read.
 * d  Number to divide with.
 * m  Multiplier result. Unused; callers may pass NULL.
 * r  Remainder from the division. 224 words are written; words 110 and 111
 *    are zero on return.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_div_112(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 112 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 112 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: t1 = shifted numerator (224 + 1 words),
         * t2 = quotient word * divisor (112 + 1 words),
         * sd = shifted divisor (112 + 1 words). */
        t2 = t1 + 224 + 1;
        sd = t2 + 112 + 1;

        sp_3072_mul_d_112(sd, d, (sp_digit)1 << 8);
        sp_3072_mul_d_224(t1, a, (sp_digit)1 << 8);
        /* Top word of the shifted divisor drives the quotient estimates. */
        dv = sd[109];
        t1[110 + 110] += t1[110 + 110 - 1] >> 28;
        t1[110 + 110 - 1] &= 0xfffffff;
        for (i=110; i>=0; i--) {
            /* Estimate a quotient word from the top two words and subtract
             * that multiple of the divisor at this position. */
            r1 = sp_3072_div_word_112(t1[110 + i], t1[110 + i - 1], dv);

            sp_3072_mul_d_112(t2, sd, r1);
            (void)sp_3072_sub_112(&t1[i], &t1[i], t2);
            sp_3072_norm_110(&t1[i]);
            t1[110 + i] += t1[110 + i - 1] >> 28;
            t1[110 + i - 1] &= 0xfffffff;
            /* Correction: estimate again from the negated top words and add
             * the resulting multiple of the divisor back. */
            r1 = sp_3072_div_word_112(-t1[110 + i], -t1[110 + i - 1], dv);
            r1 -= t1[110 + i];
            sp_3072_mul_d_112(t2, sd, r1);
            (void)sp_3072_add_112(&t1[i], &t1[i], t2);
            t1[110 + i] += t1[110 + i - 1] >> 28;
            t1[110 + i - 1] &= 0xfffffff;
        }
        /* One last single word step on the remaining top word. */
        t1[110 - 1] += t1[110 - 2] >> 28;
        t1[110 - 2] &= 0xfffffff;
        r1 = sp_3072_word_div_word_112(t1[110 - 1], dv);

        sp_3072_mul_d_112(t2, sd, r1);
        (void)sp_3072_sub_112(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 224U);
        /* Propagate carries through the low 110 words. */
        for (i=0; i<109; i++) {
            r[i+1] += r[i] >> 28;
            r[i] &= 0xfffffff;
        }
        /* Mask is all ones when the top word is negative: add the divisor
         * back in that case. */
        sp_3072_cond_add_112(r, r, sd, r[109] >> 31);

        sp_3072_norm_110(r);
        /* Undo the initial multiplication by 1 << 8. */
        sp_3072_rshift_112(r, r, 8);
        r[110] = 0;
        r[111] = 0;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_112 that discards the quotient. Note that
 * sp_3072_div_112 reads 224 words from a and copies 224 words into r, so
 * both buffers must be at least that large.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_3072_mod_112(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* The quotient argument is unused by sp_3072_div_112, hence NULL. */
    return sp_3072_div_112(a, m, NULL, r);
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is compiled in:
 *  - WOLFSSL_SP_SMALL without WOLFSSL_SP_FAST_MODEXP: one exponent bit per
 *    iteration using three temporaries; every iteration does one multiply
 *    and one square regardless of the bit value.
 *  - otherwise, when WC_NO_CACHE_RESISTANT is not defined: the same
 *    bit-at-a-time loop, without clearing the temporaries up front.
 *  - otherwise: a fixed 4-bit window over a table of 16 precomputed powers.
 *
 * r        A single precision number that is the result of the operation.
 *          224 words are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is reduced modulo m before use.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_3072_mod_exp_112(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 224];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 112 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]: t[0] starts out as the value
         * written by sp_3072_mont_norm_112. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 112 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 112U * 2U);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_112(norm, m);

        if (reduceA != 0) {
            err = sp_3072_mod_112(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 112U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m. */
        sp_3072_mul_112(t[1], t[1], norm);
        err = sp_3072_mod_112(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* i indexes the current exponent word, c counts the bits left in n,
         * and n holds those bits aligned so the next one is bit 27. */
        i = bits / 28;
        c = bits % 28;
        n = e[i--] << (28 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 28;
            }

            y = (int)((n >> 27) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_112(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit. The source and
             * destination address is formed with addr_mask rather than a
             * branch on y (addr_mask is defined elsewhere; presumably
             * {0, all ones}). */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 112 * 2);
            sp_3072_mont_sqr_112(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 112 * 2);
        }

        /* One more reduction on the result alone, then subtract m when the
         * comparison with m is non-negative. */
        sp_3072_mont_reduce_112(t[0], m, mp);
        n = sp_3072_cmp_112(t[0], m);
        sp_3072_cond_sub_112(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 112 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 224];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 112 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]: t[0] starts out as the value
         * written by sp_3072_mont_norm_112. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 112 * 2);
        }

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_112(norm, m);

        /* t[1] = (a * norm) mod m, with a optionally reduced first. */
        if (reduceA != 0) {
            err = sp_3072_mod_112(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_112(t[1], t[1], norm);
                err = sp_3072_mod_112(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_112(t[1], a, norm);
            err = sp_3072_mod_112(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* i indexes the current exponent word, c counts the bits left in n,
         * and n holds those bits aligned so the next one is bit 27. */
        i = bits / 28;
        c = bits % 28;
        n = e[i--] << (28 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 28;
            }

            y = (int)((n >> 27) & 1);
            n <<= 1;

            /* The product goes to the temporary NOT selected by the bit. */
            sp_3072_mont_mul_112(t[y^1], t[0], t[1], m, mp);

            /* Square the temporary selected by the bit. The source and
             * destination address is formed with addr_mask rather than a
             * branch on y (addr_mask is defined elsewhere; presumably
             * {0, all ones}). */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 112 * 2);
            sp_3072_mont_sqr_112(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 112 * 2);
        }

        /* One more reduction on the result alone, then subtract m when the
         * comparison with m is non-negative. */
        sp_3072_mont_reduce_112(t[0], m, mp);
        n = sp_3072_cmp_112(t[0], m);
        sp_3072_cond_sub_112(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 112 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(16 * 224) + 224];
#endif
    sp_digit* t[16];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 224) + 224), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Sixteen 224 word table entries followed by the running result rt
         * (16 * 224 = 3584). norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<16; i++)
            t[i] = td + i * 224;
        rt = td + 3584;

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_112(norm, m);

        /* t[1] = (a * norm) mod m, with a optionally reduced first. */
        if (reduceA != 0) {
            err = sp_3072_mod_112(t[1], a, m);
            if (err == MP_OKAY) {
                sp_3072_mul_112(t[1], t[1], norm);
                err = sp_3072_mod_112(t[1], t[1], m);
            }
        }
        else {
            sp_3072_mul_112(t[1], a, norm);
            err = sp_3072_mod_112(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] is the k-th power of t[1], built from a
         * square of t[k/2] for even k and a product t[(k+1)/2] * t[k/2] for
         * odd k. */
        sp_3072_mont_sqr_112(t[ 2], t[ 1], m, mp);
        sp_3072_mont_mul_112(t[ 3], t[ 2], t[ 1], m, mp);
        sp_3072_mont_sqr_112(t[ 4], t[ 2], m, mp);
        sp_3072_mont_mul_112(t[ 5], t[ 3], t[ 2], m, mp);
        sp_3072_mont_sqr_112(t[ 6], t[ 3], m, mp);
        sp_3072_mont_mul_112(t[ 7], t[ 4], t[ 3], m, mp);
        sp_3072_mont_sqr_112(t[ 8], t[ 4], m, mp);
        sp_3072_mont_mul_112(t[ 9], t[ 5], t[ 4], m, mp);
        sp_3072_mont_sqr_112(t[10], t[ 5], m, mp);
        sp_3072_mont_mul_112(t[11], t[ 6], t[ 5], m, mp);
        sp_3072_mont_sqr_112(t[12], t[ 6], m, mp);
        sp_3072_mont_mul_112(t[13], t[ 7], t[ 6], m, mp);
        sp_3072_mont_sqr_112(t[14], t[ 7], m, mp);
        sp_3072_mont_mul_112(t[15], t[ 8], t[ 7], m, mp);

        /* Round the bit count up to a whole number of 4-bit windows, then
         * load the top exponent word so its first window sits in bits
         * 28..31 of n. c counts the bits still unread in n. */
        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 27) / 28) - 1;
        c = bits % 28;
        if (c == 0) {
            c = 28;
        }
        if (i < 112) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 4) {
            n |= e[i--] << (4 - c);
            c += 28;
        }
        /* The first window selects the starting value directly. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 224);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is still available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is exhausted: load the next word and take its top
                 * window. */
                n = e[i--] << 4;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 24;
            }
            else {
                /* The window straddles two exponent words: take the bits
                 * left in n and complete them from the next word. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 4;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 28 - c;
            }

            /* Four squarings per window, then multiply by the table entry. */
            sp_3072_mont_sqr_112(rt, rt, m, mp);
            sp_3072_mont_sqr_112(rt, rt, m, mp);
            sp_3072_mont_sqr_112(rt, rt, m, mp);
            sp_3072_mont_sqr_112(rt, rt, m, mp);

            sp_3072_mont_mul_112(rt, rt, t[y], m, mp);
        }

        /* One more reduction on the result alone, then subtract m when the
         * comparison with m is non-negative. */
        sp_3072_mont_reduce_112(rt, m, mp);
        n = sp_3072_cmp_112(rt, m);
        sp_3072_cond_sub_112(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 224);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */
       /* WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in^em mod mm for a modulus of exactly 3072 bits and a public
 * exponent of at most 64 bits.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  Number of bytes in result. On entry the size of out; set to 384 on
 *         success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent is longer than 64 bits, inLen is greater than 384 or the
 * modulus is not 3072 bits long, MP_VAL when the modulus is even,
 * MP_EXPTMOD_E when the exponent is zero and MEMORY_E when dynamic memory
 * allocation fails.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[112 * 5];
#endif
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_digit* norm = NULL;
    sp_uint64 e[1] = {0};
    sp_digit mp = 0;
    int i;
    int err = MP_OKAY;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }

    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: a (224 words), r (224 words), m (112 words).
         * norm shares storage with r. */
        r = a + 112 * 2;
        m = r + 112 * 2;
        norm = r;

        sp_3072_from_bin(a, 112, in, inLen);
        /* Gather the exponent (at most 64 bits) into a single 64-bit word. */
#if DIGIT_BIT >= 64
        e[0] = (sp_uint64)em->dp[0];
#else
        e[0] = (sp_uint64)em->dp[0];
        if (em->used > 1) {
            e[0] |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 112, mm);

        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_112(norm, m);
    }
    if (err == MP_OKAY) {
        /* a = (a * norm) mod m. */
        sp_3072_mul_112(a, a, norm);
        err = sp_3072_mod_112(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Find the highest set bit of the exponent. */
        for (i=63; i>=0; i--) {
            if ((e[0] >> i) != 0) {
                break;
            }
        }

        /* Start from a for the top bit, then square for every lower bit and
         * multiply by a when that bit is set. */
        XMEMCPY(r, a, sizeof(sp_digit) * 112 * 2);
        for (i--; i>=0; i--) {
            sp_3072_mont_sqr_112(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1) {
                sp_3072_mont_mul_112(r, r, a, m, mp);
            }
        }
        /* One more reduction on the result alone, then subtract m when the
         * comparison with m is non-negative. mp is reused to hold the
         * comparison result. */
        sp_3072_mont_reduce_112(r, m, mp);
        mp = sp_3072_cmp_112(r, m);
        sp_3072_cond_sub_112(r, r, m, ~(mp >> 31));

        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[112 * 5];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_uint64 e[1] = {0};
    int err = MP_OKAY;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: a (224 words), r (224 words), m (112 words). */
        a = d;
        r = a + 112 * 2;
        m = r + 112 * 2;

        sp_3072_from_bin(a, 112, in, inLen);
        /* Gather the exponent (at most 64 bits) into a single 64-bit word. */
#if DIGIT_BIT >= 64
        e[0] = (sp_uint64)em->dp[0];
#else
        e[0] = (sp_uint64)em->dp[0];
        if (em->used > 1) {
            e[0] |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 112, mm);

        if (e[0] == 0x3) {
            /* Exponent 3: r = a^2 mod m, then r = a * r mod m, using plain
             * multiply and reduce. */
            sp_3072_sqr_112(r, a);
            err = sp_3072_mod_112(r, r, m);
            if (err == MP_OKAY) {
                sp_3072_mul_112(r, a, r);
                err = sp_3072_mod_112(r, r, m);
            }
        }
        else {
            /* norm shares storage with r. */
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_3072_mont_setup(m, &mp);
            sp_3072_mont_norm_112(norm, m);

            /* a = (a * norm) mod m. */
            sp_3072_mul_112(a, a, norm);
            err = sp_3072_mod_112(a, a, m);

            if (err == MP_OKAY) {
                /* Find the highest set bit of the exponent. */
                for (i=63; i>=0; i--) {
                    if ((e[0] >> i) != 0) {
                        break;
                    }
                }

                /* Start from a for the top bit, then square for every lower
                 * bit and multiply by a when that bit is set. */
                XMEMCPY(r, a, sizeof(sp_digit) * 224U);
                for (i--; i>=0; i--) {
                    sp_3072_mont_sqr_112(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1) {
                        sp_3072_mont_mul_112(r, r, a, m, mp);
                    }
                }
                /* One more reduction on the result alone, then subtract m
                 * when the comparison with m is non-negative. mp is reused
                 * to hold the comparison result. */
                sp_3072_mont_reduce_112(r, m, mp);
                mp = sp_3072_cmp_112(r, m);
                sp_3072_cond_sub_112(r, r, m, ~(mp >> 31));
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM)
#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */
/* RSA private key operation.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#if defined(WOLFSSL_SP_SMALL)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit  d[112 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 3072) {
           err = MP_READ_E;
        }
        else if (inLen > 384) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        a = d + 112;
        m = a + 224;
        r = a;

        sp_3072_from_bin(a, 112, in, inLen);
        sp_3072_from_mp(d, 112, dm);
        sp_3072_from_mp(m, 112, mm);
        err = sp_3072_mod_exp_112(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* only "a" and "r" are sensitive and need zeroized (same pointer) */
        if (a != NULL)
            ForceZero(a, sizeof(sp_digit) * 112);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[112 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 3072) {
            err = MP_READ_E;
        }
        else if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        a = d + 112;
        m = a + 224;
        r = a;

        sp_3072_from_bin(a, 112, in, inLen);
        sp_3072_from_mp(d, 112, dm);
        sp_3072_from_mp(m, 112, mm);
        err = sp_3072_mod_exp_112(r, a, d, 3072, m, 0);
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* only "a" and "r" are sensitive and need zeroized (same pointer) */
        if (a != NULL)
            ForceZero(a, sizeof(sp_digit) * 112);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#else
#if defined(WOLFSSL_SP_SMALL)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[56 * 8];
#endif
    sp_digit* p = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 384) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 8, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif
    if (err == MP_OKAY) {
        p = a + 112;
        qi = dq = dp = p + 56;
        tmpa = qi + 56;
        tmpb = tmpa + 112;
        r = a;

        sp_3072_from_bin(a, 112, in, inLen);
        sp_3072_from_mp(p, 56, pm);
        sp_3072_from_mp(dp, 56, dpm);
        err = sp_3072_mod_exp_56(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(p, 56, qm);
        sp_3072_from_mp(dq, 56, dqm);
        err = sp_3072_mod_exp_56(tmpb, a, dq, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(p, 56, pm);
        (void)sp_3072_sub_56(tmpa, tmpa, tmpb);
        sp_3072_norm_55(tmpa);
        sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31));
        sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31));
        sp_3072_norm_56(tmpa);

        sp_3072_from_mp(qi, 56, qim);
        sp_3072_mul_56(tmpa, tmpa, qi);
        err = sp_3072_mod_56(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(p, 56, qm);
        sp_3072_mul_56(tmpa, p, tmpa);
        (void)sp_3072_add_112(r, tmpb, tmpa);
        sp_3072_norm_112(r);

        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        ForceZero(a, sizeof(sp_digit) * 56 * 8);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[56 * 13];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 384U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 384U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 3072) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 13, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        p = a + 112 * 2;
        q = p + 56;
        dp = q + 56;
        dq = dp + 56;
        qi = dq + 56;
        tmpa = qi + 56;
        tmpb = tmpa + 112;
        r = a;

        sp_3072_from_bin(a, 112, in, inLen);
        sp_3072_from_mp(p, 56, pm);
        sp_3072_from_mp(q, 56, qm);
        sp_3072_from_mp(dp, 56, dpm);
        sp_3072_from_mp(dq, 56, dqm);
        sp_3072_from_mp(qi, 56, qim);

        err = sp_3072_mod_exp_56(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        err = sp_3072_mod_exp_56(tmpb, a, dq, 1536, q, 1);
    }

    if (err == MP_OKAY) {
        (void)sp_3072_sub_56(tmpa, tmpa, tmpb);
        sp_3072_norm_55(tmpa);
        sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31));
        sp_3072_cond_add_56(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[54] >> 31));
        sp_3072_norm_56(tmpa);
        sp_3072_mul_56(tmpa, tmpa, qi);
        err = sp_3072_mod_56(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        sp_3072_mul_56(tmpa, tmpa, q);
        (void)sp_3072_add_112(r, tmpb, tmpa);
        sp_3072_norm_112(r);

        sp_3072_to_bin_112(r, out);
        *outLen = 384;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
if (a != NULL)
#endif
    {
        ForceZero(a, sizeof(sp_digit) * 56 * 13);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
}

#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * Repacks 110 words of a, 28 bits apiece, into the DIGIT_BIT-wide digits of
 * r and then clamps r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the value from mp_grow(): MP_OKAY when r could be sized, otherwise
 * the error it reported (no conversion is attempted in that case).
 */
static int sp_3072_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    /* Ask for ceil(3072 / DIGIT_BIT) digits in r. */
    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 28
        /* Digit widths match: copy the words across unchanged. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 110);
        r->used = 110;
        mp_clamp(r);
#elif DIGIT_BIT < 28
        /* mp digits are narrower than 28 bits: each word of a is spread over
         * more than one digit of r.
         * i indexes a, j indexes r->dp and s is the running bit offset. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 110; i++) {
            /* Top up the partially filled digit with the low bits of a[i]. */
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            /* Start the next digit with what is left of a[i]. */
            r->dp[++j] = (mp_digit)(a[i] >> s);
            /* Keep emitting whole digits while a[i] still has a full
             * DIGIT_BIT bits outstanding. */
            while (s + DIGIT_BIT <= 28) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    /* Avoids shifting by the full word size below. */
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            /* Number of bits of a[i] already sitting in the current digit. */
            s = 28 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp digits are wider than 28 bits: several words of a are packed
         * into each digit of r.
         * i indexes a, j indexes r->dp and s is the bit offset within the
         * current digit. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 110; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 28 >= DIGIT_BIT) {
                /* a[i] reaches the top of the digit: mask the digit to its
                 * width where needed and spill the remaining bits into the
                 * next one. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 28 - s;
            }
            else {
                /* Whole word fitted; the next one goes 28 bits higher. */
                s += 28;
            }
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 * (res = base ^ exp mod mod)
 *
 * base  Base. MP integer of no more than 3072 bits.
 * exp   Exponent. MP integer of no more than 3072 bits.
 * mod   Modulus. MP integer of exactly 3072 bits that is odd.
 * res   Result. MP integer.
 * returns MP_OKAY on success, MP_READ_E when base or exp has too many bits or
 * mod is not 3072 bits long, MP_VAL when mod is even, MEMORY_E if memory
 * allocation fails, otherwise the error reported by the exponentiation or by
 * the conversion of the result.
 */
int sp_ModExp_3072(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    /* The WOLFSSL_SP_SMALL and non-small builds share this single body: the
     * two variants were duplicates of each other. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[112 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    /* Counted once and reused for both validation and exponentiation. */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 3072) {
        err = MP_READ_E;
    }
    else if (expBits > 3072) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 3072) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout in words: [0, 224) base and result, [224, 336)
         * exponent, [336, 448) modulus. */
        e = b + 112 * 2;
        m = e + 112;
        r = b;

        sp_3072_from_mp(b, 112, base);
        sp_3072_from_mp(e, 112, exp);
        sp_3072_from_mp(m, 112, mod);

        err = sp_3072_mod_exp_112(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_3072_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        /* e stays NULL when validation or allocation failed. */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 112U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_3072
/* Shift the number left by n bits. (r = a << n)
 *
 * Words are handled from the most significant down and each source word is
 * read before its slot in r is written, so r may be the same buffer as a.
 *
 * r  Result of the shift; 113 words, r[0] to r[112], are written.
 * a  Number to shift; 112 words of 28 bits each are read.
 * n  Number of bits to shift by. Must not exceed 28.
 */
SP_NOINLINE static void sp_3072_lshift_112(sp_digit* r, const sp_digit* a,
        byte n)
{
    const unsigned int down = 28U - n;
    sp_int_digit hi;
    sp_int_digit lo;
    int k;

    /* Bits pushed out of the top word land in an extra word. */
    hi = (sp_int_digit)a[111];
    r[112] = hi >> down;

    /* Each result word is the word itself moved up, joined with the bits
     * that fall out of the word beneath it. */
    for (k = 111; k >= 1; k--) {
        hi = (sp_int_digit)(a[k]);
        lo = (sp_int_digit)(a[k - 1]);
        r[k] = ((hi << n) | (lo >> down)) & 0xfffffff;
    }

    /* Nothing below the bottom word to bring in. */
    r[0] = (a[0] << n) & 0xfffffff;
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed four bits at a time from the most significant end.
 * Each step squares r four times and then multiplies by a power of two, which
 * is done with a left shift instead of a Montgomery multiplication.
 *
 * r     A single precision number that is the result of the operation.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is zero.
 */
static int sp_3072_mod_exp_2_112(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[337];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 337, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Scratch layout: norm is the first 224 words of td, tmp the
         * remaining 113. */
        norm = td;
        tmp  = td + 224;
        XMEMSET(td, 0, sizeof(sp_digit) * 337);

        /* NOTE(review): presumably mp is the Montgomery multiplier for m and
         * norm the Montgomery form of one - confirm against the helpers. */
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_112(norm, m);

        /* Round the bit count up to a whole number of 4-bit windows. */
        bits = ((bits + 3) / 4) * 4;
        /* i: index of the top exponent word in use; c: bits used in it. */
        i = ((bits + 27) / 28) - 1;
        c = bits % 28;
        if (c == 0) {
            c = 28;
        }
        /* n holds pending exponent bits left-aligned in a 32-bit word. */
        if (i < 112) {
            n = e[i--] << (32 - c);
        }
        else {
            /* Rounding moved past the last word of e: top bits are zero. */
            n = 0;
            i--;
        }
        if (c < 4) {
            /* Fewer than four bits pending: bring in the next word down. */
            n |= e[i--] << (4 - c);
            c += 28;
        }
        /* First window: r = norm * 2^y. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        sp_3072_lshift_112(r, norm, (byte)y);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is already pending in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* Nothing pending: load the next 28-bit word, left-aligned. */
                n = e[i--] << 4;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 24;
            }
            else {
                /* Window straddles two words: take the pending bits and
                 * complete them from the top of the next word. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 4;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 28 - c;
            }

            /* r = r^16 */
            sp_3072_mont_sqr_112(r, r, m, mp);
            sp_3072_mont_sqr_112(r, r, m, mp);
            sp_3072_mont_sqr_112(r, r, m, mp);
            sp_3072_mont_sqr_112(r, r, m, mp);

            /* r = r * 2^y */
            sp_3072_lshift_112(r, r, (byte)y);
            /* Bits from position 3072 upwards (bit 20 of word 109 and above)
             * are taken off the top and added back as a multiple of norm. */
            sp_3072_mul_d_112(tmp, norm, (r[110] << 8) + (r[109] >> 20));
            r[110] = 0;
            r[109] &= 0xfffffL;
            (void)sp_3072_add_112(r, r, tmp);
            sp_3072_norm_112(r);
            /* Subtract m using a mask in place of a branch; the mask is all
             * ones unless the comparison result is negative. */
            o = sp_3072_cmp_112(r, m);
            sp_3072_cond_sub_112(r, r, m, ~(o >> 31));
        }

        /* NOTE(review): presumably converts r out of Montgomery form -
         * confirm against sp_3072_mont_reduce_112. */
        sp_3072_mont_reduce_112(r, m, mp);
        n = sp_3072_cmp_112(r, m);
        sp_3072_cond_sub_112(r, r, m, ~(n >> 31));
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* HAVE_FFDHE_3072 */

/* Diffie-Hellman modular exponentiation with the exponent given as bytes.
 * (out = base ^ exp mod mod)
 *
 * base     Base. MP integer of no more than 3072 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. No more than 384.
 * mod      Modulus. MP integer of exactly 3072 bits that is odd.
 * out      Buffer to hold big-endian bytes of exponentiation result, with
 *          leading zero bytes removed. Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns MP_OKAY on success, MP_READ_E if there are too many bytes in an
 * array or the modulus is not 3072 bits long, MP_VAL when the modulus is
 * even, MEMORY_E if memory allocation fails, otherwise the error reported by
 * the exponentiation.
 */
int sp_DhExp_3072(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[112 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    word32 lead;
    int err = MP_OKAY;

    /* Size checks first, in order; parity only matters for a modulus of the
     * right length. */
    if ((mp_count_bits(base) > 3072) || (expLen > 384U) ||
            (mp_count_bits(mod) != 3072)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 112 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Result shares the front of the buffer with the base; exponent and
         * modulus follow. */
        r = b;
        e = b + 112 * 2;
        m = e + 112;

        sp_3072_from_mp(b, 112, base);
        sp_3072_from_bin(e, 112, exp, expLen);
        sp_3072_from_mp(m, 112, mod);

    #ifdef HAVE_FFDHE_3072
        /* Base of two with a modulus whose top 16 bits are all set: use the
         * dedicated power-of-two routine. */
        if ((base->used == 1) && (base->dp[0] == 2U) &&
                ((m[109] >> 4) == 0xffffL)) {
            err = sp_3072_mod_exp_2_112(r, e, expLen * 8U, m);
        }
        else
    #endif
        {
            err = sp_3072_mod_exp_112(r, b, e, expLen * 8U, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin_112(r, out);

        /* Count the leading zero bytes and slide the rest to the front. */
        lead = 0;
        while ((lead < 384U) && (out[lead] == 0U)) {
            lead++;
        }
        *outLen = 384U - lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* only "e" is sensitive and needs zeroized */
    /* e is only set once the working buffer is available. */
    if (e != NULL) {
        ForceZero(e, sizeof(sp_digit) * 112U);
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL) {
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    }
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

/* Perform the modular exponentiation for Diffie-Hellman.
 * (res = base ^ exp mod mod)
 *
 * base  Base. MP integer of no more than 1536 bits.
 * exp   Exponent. MP integer of no more than 1536 bits.
 * mod   Modulus. MP integer of exactly 1536 bits that is odd.
 * res   Result. MP integer.
 * returns MP_OKAY on success, MP_READ_E when base or exp has too many bits or
 * mod is not 1536 bits long, MP_VAL when mod is even, MEMORY_E if memory
 * allocation fails, otherwise the error reported by the exponentiation or by
 * the conversion of the result.
 */
int sp_ModExp_1536(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
    /* The WOLFSSL_SP_SMALL and non-small builds share this single body: the
     * two variants were duplicates of each other. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[56 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    /* Counted once and reused for both validation and exponentiation. */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 1536) {
        err = MP_READ_E;
    }
    else if (expBits > 1536) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 1536) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 56 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout in words: [0, 112) base and result, [112, 168)
         * exponent, [168, 224) modulus. */
        e = b + 56 * 2;
        m = e + 56;
        r = b;

        sp_3072_from_mp(b, 56, base);
        sp_3072_from_mp(e, 56, exp);
        sp_3072_from_mp(m, 56, mod);

        err = sp_3072_mod_exp_56(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        /* sp_3072_to_mp() reads 110 words, so clear the words above the
         * 56-word result before converting. */
        XMEMSET(r + 56, 0, sizeof(*r) * 56U);
        err = sp_3072_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        /* The 112 words cleared run from e to the end of the buffer, so the
         * modulus copy is wiped as well. e stays NULL when validation or
         * allocation failed. */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 112U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* WOLFSSL_SP_SMALL */
#endif /* !WOLFSSL_SP_NO_3072 */

#ifdef WOLFSSL_SP_4096
#ifdef WOLFSSL_SP_SMALL
/* Read big endian unsigned byte array into r.
 *
 * Bytes are taken from the end of the array, least significant first, and
 * packed 29 bits to a word.
 *
 * r     A single precision integer.
 * size  Number of words in r. Conversion stops once they are full and any
 *       words not reached are set to zero.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int src = n - 1;
    int dst = 0;
    word32 shift = 0;

    r[0] = 0;
    while (src >= 0) {
        /* Place the byte at the current bit offset of the current word. */
        r[dst] |= (((sp_digit)a[src]) << shift);
        if (shift < 21U) {
            /* Byte fitted with room to spare in the 29-bit word. */
            shift += 8U;
        }
        else {
            /* Byte reached the top of the word: trim the word to 29 bits and
             * carry the byte's remaining high bits into the next word. */
            const word32 taken = 29U - shift;

            r[dst] &= 0x1fffffff;
            if (dst + 1 >= size) {
                break;
            }
            dst++;
            r[dst] = (sp_digit)a[src] >> taken;
            shift = 8U - taken;
        }
        src--;
    }

    /* Clear every word after the last one written. */
    for (dst = dst + 1; dst < size; dst++) {
        r[dst] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the DIGIT_BIT-wide digits of a into words of 29 bits.
 *
 * r     A single precision integer.
 * size  Number of words in r. Conversion stops once they are full and words
 *       beyond the value are set to zero.
 * a     A multi-precision integer.
 */
static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 29
    /* Digit widths match: copy without branching on a->used. */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* j is negative exactly while i < a->used. sp_digit is a 32-bit
         * word, so that sign lives in bit 31 (not bit 28): take it from the
         * unsigned form to get 1 or 0, giving a mask of all ones or zero. */
        sp_digit mask = (sp_digit)0 - (sp_digit)((sp_int_digit)j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Step to the next digit only while more digits remain, so o never
         * passes the last used digit. */
        o += (int)((sp_int_digit)j >> 31);
    }
#elif DIGIT_BIT > 29
    /* mp digits are wider than 29 bits: each one is spread over more than
     * one word of r. i indexes a->dp, j indexes r and s is the running bit
     * offset. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Top up the partially filled word with the low bits of the digit. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x1fffffff;
        s = 29U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Keep emitting whole words while the digit has 29 bits left. */
        while ((s + 29U) <= (word32)DIGIT_BIT) {
            s += 29U;
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                /* Digit fully consumed: avoid shifting by its full width. */
                r[++j] = (sp_digit)0;
            }
        }
        /* Number of bits of this digit already in the current word. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Clear every word after the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp digits are narrower than 29 bits: several are packed into each
     * word of r. i indexes a->dp, j indexes r and s is the bit offset within
     * the current word. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 29) {
            /* Digit reaches the top of the word: trim the word to 29 bits
             * and spill any remaining bits into the next one. */
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 29 - s;
            if (s == DIGIT_BIT) {
                /* Digit ended exactly on the word boundary. */
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            /* Whole digit fitted; the next one goes DIGIT_BIT bits higher. */
            s += DIGIT_BIT;
        }
    }

    /* Clear every word after the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Serialize r into a as a big endian byte string.
 * Output is fixed length: the last byte written is a[511].
 *
 * The digits of r are carry-propagated in place before being written.
 *
 * r  A single precision integer (142 digits, modified).
 * a  Byte array receiving the encoding.
 */
static void sp_4096_to_bin_142(sp_digit* r, byte* a)
{
    int word;
    int pos;
    int shift = 0;
    int taken;

    /* Bring every digit below the top one into 29 bits. */
    for (word = 0; word < 141; word++) {
        r[word + 1] += r[word] >> 29;
        r[word] &= 0x1fffffff;
    }

    pos = 4103 / 8 - 1;
    a[pos] = 0;
    for (word = 0; (word < 142) && (pos >= 0); word++) {
        /* Merge the low bits of this digit into the partly filled byte. */
        a[pos--] |= (byte)(r[word] << shift);
        taken = 8 - shift;
        if (pos < 0) {
            break;
        }
        /* Emit the remaining bits of the digit a byte at a time. */
        while (taken < 29) {
            a[pos--] = (byte)(r[word] >> taken);
            taken += 8;
            if (pos < 0) {
                break;
            }
        }
        /* Number of bits of the last byte already used by this digit. */
        shift = 8 - (taken - 29);
        if (pos >= 0) {
            a[pos] = 0;
        }
        if (shift != 0) {
            pos++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Carry-propagate so that the low 70 digits each hold 29 bits.
 *
 * Anything above bit 28 of a digit is added into the next digit up; the top
 * digit (index 70) only receives carry and is not masked.
 *
 * a  Array of 71 sp_digit to normalize in place.
 */
static void sp_4096_norm_71(sp_digit* a)
{
    sp_digit* cur = a;
    const sp_digit* top = a + 70;

    while (cur != top) {
        cur[1] += cur[0] >> 29;
        cur[0] &= 0x1fffffff;
        cur++;
    }
}

#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
/* Carry-propagate so that the low 141 digits each hold 29 bits.
 *
 * Anything above bit 28 of a digit is added into the next digit up; the top
 * digit (index 141) only receives carry and is not masked.
 *
 * a  Array of 142 sp_digit to normalize in place.
 */
static void sp_4096_norm_142(sp_digit* a)
{
    sp_digit* cur = a;
    const sp_digit* top = a + 141;

    while (cur != top) {
        cur[1] += cur[0] >> 29;
        cur[0] &= 0x1fffffff;
        cur++;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * Works column by column from the most significant result digit down, so a
 * result digit is only stored after every input digit it depends on was read.
 *
 * r  A single precision integer (284 digits written).
 * a  A single precision integer (142 digits).
 * b  A single precision integer (142 digits).
 */
SP_NOINLINE static void sp_4096_mul_142(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int first;
    int last;
    int col;
    sp_uint64 carry;
    sp_uint64 sum;

    /* Top column has a single product. */
    carry = ((sp_uint64)a[141]) * b[141];
    r[283] = (sp_digit)(carry >> 29);
    carry &= 0x1fffffff;
    for (col = 281; col >= 0; col--) {
        /* Range of indices into a that contribute to this column. */
        first = (col >= 142) ? (col - 141) : 0;
        last  = (col >= 142) ? 141 : col;

        sum = 0;
        if (last - first > 15) {
            int chunk;

            /* Long column: fold the high part into carry every 15 products
             * to keep the accumulator from overflowing. */
            for (chunk = first; chunk <= last; chunk += 15) {
                for (; first <= last && first < chunk + 15; first++) {
                    sum += ((sp_uint64)a[first]) * b[col - first];
                }
                carry += sum >> 29;
                sum &= 0x1fffffff;
            }
        }
        else {
            for (; first <= last; first++) {
                sum += ((sp_uint64)a[first]) * b[col - first];
            }
            carry += sum >> 29;
        }

        r[col + 2] += (sp_digit)(carry >> 29);
        r[col + 1]  = (sp_digit)(carry & 0x1fffffff);
        carry = sum & 0x1fffffff;
    }
    r[0] = (sp_digit)carry;
}

/* Square a and put result in r. (r = a * a)
 *
 * Works column by column from the top. Each cross product a[i] * a[k - i]
 * is computed once and doubled; the diagonal term is added once.
 *
 * r  A single precision integer (284 digits written).
 * a  A single precision integer (142 digits).
 */
SP_NOINLINE static void sp_4096_sqr_142(sp_digit* r, const sp_digit* a)
{
    int first;
    int last;
    int col;
    sp_uint64 acc;
    sp_uint64 cross;

    acc = ((sp_uint64)a[141]) * a[141];
    r[283] = (sp_digit)(acc >> 29);
    acc = (acc & 0x1fffffff) << 29;
    for (col = 281; col >= 0; col--) {
        first = (col + 1) / 2;
        if ((col & 1) == 0) {
            /* Even column: include the diagonal term a[col/2]^2. */
            acc += ((sp_uint64)a[first]) * a[first];
            first++;
        }
        last = (col < 141) ? col : 141;

        if (last - first >= 14) {
            int chunk;
            sp_uint64 upper;

            /* Long column: split the accumulator and fold every 14 cross
             * products so that the doubled sum cannot overflow. */
            upper = acc >> 29;
            acc &= 0x1fffffff;
            for (chunk = first; chunk <= last; chunk += 14) {
                cross = 0;
                for (; first <= last && first < chunk + 14; first++) {
                    cross += ((sp_uint64)a[first]) * a[col - first];
                }
                acc += cross * 2;

                upper += acc >> 29;
                acc &= 0x1fffffff;
            }
            r[col + 2] += (sp_digit)(upper >> 29);
            r[col + 1]  = (sp_digit)(upper & 0x1fffffff);
            acc <<= 29;
        }
        else {
            cross = 0;
            for (; first <= last; first++) {
                cross += ((sp_uint64)a[first]) * a[col - first];
            }
            acc += cross * 2;

            r[col + 2] += (sp_digit) (acc >> 58);
            r[col + 1]  = (sp_digit)((acc >> 29) & 0x1fffffff);
            acc = (acc & 0x1fffffff) << 29;
        }
    }
    r[0] = (sp_digit)(acc >> 29);
}

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only a[0] is read. The inverse is found by Newton iteration, doubling the
 * number of correct low bits on each step.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho)
{
    /* The iteration relies on wrap-around of the products. Use an unsigned
     * type so the wrap is well defined; overflowing the signed digit type is
     * undefined behaviour. Only the low 29 bits are kept, so the extra width
     * of unsigned long does not change the result. */
    unsigned long x;
    unsigned long b;

    b = (unsigned long)a[0];
    x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
    x &= 0x1fffffff;

    /* rho = -1/m mod b */
    *rho = (sp_digit)(((unsigned long)1 << 29) - x);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * r  A single precision integer (143 digits written).
 * a  A single precision integer (142 digits).
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_d_142(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 142) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Whatever is left is the extra top digit. */
    r[142] = (sp_digit)acc;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently; no borrow is propagated here.
 *
 * r  A single precision integer (71 digits).
 * a  A single precision integer (71 digits).
 * b  A single precision integer (71 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_71(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 71) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 * The digit constants used give n = 70 * 29 + 18 = 2048.
 * A single subtraction of m is performed, so m is presumably expected to
 * occupy all n bits - confirm against callers.
 *
 * r  A single precision number (71 digits).
 * m  A single precision number (71 digits).
 */
static void sp_4096_mont_norm_71(sp_digit* r, const sp_digit* m)
{
    int idx = 0;

    /* r = 2^n - 1: seventy full digits and an 18-bit top digit. */
    while (idx < 70) {
        r[idx++] = 0x1fffffff;
    }
    r[idx] = 0x3ffffL;

    /* r = (2^n - 1) - m */
    (void)sp_4096_sub_71(r, r, m);

    /* Add back the one. */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are visited from most to least significant; every digit is always
 * processed and no branch depends on the data.
 *
 * a  A single precision integer (71 digits).
 * b  A single precision integer (71 digits).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_4096_cmp_71(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int idx = 70;

    do {
        diff |= (a[idx] - b[idx]) & ~(((sp_digit)0 - diff) >> 28);
    } while (--idx >= 0);

    return diff;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Each digit of b is ANDed with m before being subtracted, so the same
 * operations run whichever mask is given. No borrow is propagated.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_4096_cond_sub_71(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 71) {
        r[idx] = a[idx] - (b[idx] & m);
        idx++;
    }
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * The low 71 digits of r are left with 29 bits each; the final carry is
 * added into r[71].
 *
 * r  A single precision integer (72 digits touched).
 * a  A single precision integer (71 digits).
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_add_71(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_LARGE_CODE
    /* Unrolled by four: 17 rounds cover digits 0..67, then a 3 digit tail. */
    sp_int64 scalar = b;
    sp_int64 t0;
    sp_int64 t1;
    sp_int64 t2;
    sp_int64 t3;
    int idx;

    t0 = 0;
    for (idx = 0; idx < 68; idx += 4) {
        t0 += (scalar * a[idx+0]) + r[idx+0];
        t1  = (scalar * a[idx+1]) + r[idx+1];
        t2  = (scalar * a[idx+2]) + r[idx+2];
        t3  = (scalar * a[idx+3]) + r[idx+3];
        r[idx+0] = (sp_digit)(t0 & 0x1fffffff);
        t1 += t0 >> 29;
        r[idx+1] = (sp_digit)(t1 & 0x1fffffff);
        t2 += t1 >> 29;
        r[idx+2] = (sp_digit)(t2 & 0x1fffffff);
        t3 += t2 >> 29;
        r[idx+3] = (sp_digit)(t3 & 0x1fffffff);
        /* Carry into the first digit of the next round. */
        t0  = t3 >> 29;
    }
    t0 += (scalar * a[68]) + r[68];
    t1  = (scalar * a[69]) + r[69];
    t2  = (scalar * a[70]) + r[70];
    r[68] = (sp_digit)(t0 & 0x1fffffff);
    t1 += t0 >> 29;
    r[69] = (sp_digit)(t1 & 0x1fffffff);
    t2 += t1 >> 29;
    r[70] = (sp_digit)(t2 & 0x1fffffff);
    r[71] +=  (sp_digit)(t2 >> 29);
#else
    /* Compact form: one digit per iteration with a running carry. */
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 71) {
        acc += r[idx];
        acc += scalar * a[idx];
        r[idx] = ((sp_digit)acc) & 0x1fffffff;
        acc >>= 29;
        idx++;
    }
    r[71] += (sp_digit)acc;
#endif /* WOLFSSL_SP_LARGE_CODE */
}

/* Shift the result in the high 2048 bits down to the bottom.
 *
 * Bit 2048 of a is bit 18 of a[70], so the value is moved down by 70 digits
 * and 18 bits. The upper 71 digits of r are cleared afterwards.
 *
 * r  A single precision number (142 digits written).
 * a  A single precision number (142 digits read).
 */
static void sp_4096_mont_shift_71(sp_digit* r, const sp_digit* a)
{
    int idx;
    sp_int64 window;

    /* Seed with the 11 surviving bits of a[70] and all of a[71] above. */
    window  = a[70] >> 18;
    window += ((sp_int64)a[71]) << 11;

    for (idx = 0; idx < 70; idx++) {
        r[idx] = (sp_digit)(window & 0x1fffffff);
        window >>= 29;
        window += ((sp_int64)a[72 + idx]) << 11;
    }
    r[70] = (sp_digit)window;
    XMEMSET(&r[71], 0, sizeof(*r) * 71U);
}

/* Reduce the number back to 2048 bits (71 digits) using Montgomery reduction.
 *
 * a   A single precision number to reduce in place (142 digits).
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_4096_mont_reduce_71(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int idx = 0;
    sp_digit factor;
    sp_digit diff;

    sp_4096_norm_71(a + 71);

    /* Add the multiple of m that clears each of the low 70 digits. */
    while (idx < 70) {
        factor = ((sp_uint32)a[idx] * (sp_uint32)mp) & 0x1fffffff;
        sp_4096_mul_add_71(a + idx, m, factor);
        a[idx + 1] += a[idx] >> 29;
        idx++;
    }
    /* Digit 70 only contributes its low 18 bits. */
    factor = ((sp_uint32)a[70] * (sp_uint32)mp) & 0x3ffffL;
    sp_4096_mul_add_71(a + 70, m, factor);
    a[71] += a[70] >> 29;
    a[70] &= 0x1fffffff;

    /* Drop the cleared low part. */
    sp_4096_mont_shift_71(a, a);

    /* Subtract m when the top digit of the result exceeds that of m. */
    diff = a[70] - m[70];
    sp_4096_cond_sub_71(a, a, m, ~((diff - 1) >> 31));
    sp_4096_norm_71(a);
}

/* Multiply a and b into r. (r = a * b)
 *
 * Works column by column from the most significant result digit down, so a
 * result digit is only stored after every input digit it depends on was read.
 *
 * r  A single precision integer (142 digits written).
 * a  A single precision integer (71 digits).
 * b  A single precision integer (71 digits).
 */
SP_NOINLINE static void sp_4096_mul_71(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    int first;
    int last;
    int col;
    sp_uint64 carry;
    sp_uint64 sum;

    /* Top column has a single product. */
    carry = ((sp_uint64)a[70]) * b[70];
    r[141] = (sp_digit)(carry >> 29);
    carry &= 0x1fffffff;
    for (col = 139; col >= 0; col--) {
        /* Range of indices into a that contribute to this column. */
        first = (col >= 71) ? (col - 70) : 0;
        last  = (col >= 71) ? 70 : col;

        sum = 0;
        if (last - first > 15) {
            int chunk;

            /* Long column: fold the high part into carry every 15 products
             * to keep the accumulator from overflowing. */
            for (chunk = first; chunk <= last; chunk += 15) {
                for (; first <= last && first < chunk + 15; first++) {
                    sum += ((sp_uint64)a[first]) * b[col - first];
                }
                carry += sum >> 29;
                sum &= 0x1fffffff;
            }
        }
        else {
            for (; first <= last; first++) {
                sum += ((sp_uint64)a[first]) * b[col - first];
            }
            carry += sum >> 29;
        }

        r[col + 2] += (sp_digit)(carry >> 29);
        r[col + 1]  = (sp_digit)(carry & 0x1fffffff);
        carry = sum & 0x1fffffff;
    }
    r[0] = (sp_digit)carry;
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r first and then Montgomery reduced in
 * place, so r must have room for the double-width product.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_mul_71(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Double-width product followed by reduction back to 71 digits. */
    sp_4096_mul_71(r, a, b);
    sp_4096_mont_reduce_71(r, m, mp);
}

/* Square a and put result in r. (r = a * a)
 *
 * Works column by column from the top. Each cross product a[i] * a[k - i]
 * is computed once and doubled; the diagonal term is added once.
 *
 * r  A single precision integer (142 digits written).
 * a  A single precision integer (71 digits).
 */
SP_NOINLINE static void sp_4096_sqr_71(sp_digit* r, const sp_digit* a)
{
    int first;
    int last;
    int col;
    sp_uint64 acc;
    sp_uint64 cross;

    acc = ((sp_uint64)a[70]) * a[70];
    r[141] = (sp_digit)(acc >> 29);
    acc = (acc & 0x1fffffff) << 29;
    for (col = 139; col >= 0; col--) {
        first = (col + 1) / 2;
        if ((col & 1) == 0) {
            /* Even column: include the diagonal term a[col/2]^2. */
            acc += ((sp_uint64)a[first]) * a[first];
            first++;
        }
        last = (col < 70) ? col : 70;

        if (last - first >= 14) {
            int chunk;
            sp_uint64 upper;

            /* Long column: split the accumulator and fold every 14 cross
             * products so that the doubled sum cannot overflow. */
            upper = acc >> 29;
            acc &= 0x1fffffff;
            for (chunk = first; chunk <= last; chunk += 14) {
                cross = 0;
                for (; first <= last && first < chunk + 14; first++) {
                    cross += ((sp_uint64)a[first]) * a[col - first];
                }
                acc += cross * 2;

                upper += acc >> 29;
                acc &= 0x1fffffff;
            }
            r[col + 2] += (sp_digit)(upper >> 29);
            r[col + 1]  = (sp_digit)(upper & 0x1fffffff);
            acc <<= 29;
        }
        else {
            cross = 0;
            for (; first <= last; first++) {
                cross += ((sp_uint64)a[first]) * a[col - first];
            }
            acc += cross * 2;

            r[col + 2] += (sp_digit) (acc >> 58);
            r[col + 1]  = (sp_digit)((acc >> 29) & 0x1fffffff);
            acc = (acc & 0x1fffffff) << 29;
        }
    }
    r[0] = (sp_digit)(acc >> 29);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r first and then Montgomery reduced in
 * place, so r must have room for the double-width result.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_sqr_71(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Double-width square followed by reduction back to 71 digits. */
    sp_4096_sqr_71(r, a);
    sp_4096_mont_reduce_71(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * r  A single precision integer (72 digits written).
 * a  A single precision integer (71 digits).
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_d_71(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 71) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    /* Whatever is left is the extra top digit. */
    r[71] = (sp_digit)acc;
}

#ifdef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Each digit of b is ANDed with m before being added, so the same
 * operations run whichever mask is given. No carry is propagated.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_4096_cond_add_71(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    while (idx < 71) {
        r[idx] = a[idx] + (b[idx] & m);
        idx++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

/* Add b to a into r. (r = a + b)
 *
 * Digits are added independently; no carry is propagated here.
 *
 * r  A single precision integer (71 digits).
 * a  A single precision integer (71 digits).
 * b  A single precision integer (71 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_71(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 71) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}

/* Shift a right by n bits into r. (r = a >> n)
 *
 * r  A single precision integer (71 digits).
 * a  A single precision integer (71 digits of 29 bits).
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_4096_rshift_71(sp_digit* r, const sp_digit* a,
        byte n)
{
    int idx = 0;

    while (idx < 70) {
        /* Low bits come from this digit, high bits from the one above. */
        r[idx] = ((a[idx] >> n) | (a[idx + 1] << (29 - n))) & 0x1fffffff;
        idx++;
    }
    r[70] = a[70] >> n;
}

/* Divide the double digit value (d1 * 2^29 + d0) by div.
 *
 * Four implementations are selected at compile time: a plain 64-bit divide,
 * the x86 idiv instruction, a division-free bit-by-bit estimate with
 * correction steps, and a three step estimate using 32-bit divides.
 *
 * d1   High digit of the dividend.
 * d0   Low digit of the dividend.
 * div  The divisor.
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_4096_div_word_71(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* idiv divides EDX:EAX and leaves the quotient in EAX and the remainder
     * in EDX. Both registers are therefore written by the instruction and
     * must be declared read-write: an input-only operand may not be
     * modified by the asm. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo), "+d" (hi)
        : "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Build a quotient estimate one bit at a time using compare and masked
     * subtract only (no divide instruction, no data dependent branch). */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remaining difference d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final +/-1 adjustments based on the sign and size of the error. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 14) + 1;

    /* Estimate the quotient in three steps with 32-bit divides, removing
     * the part already accounted for from d after each step. */
    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide the single digit d by div.
 *
 * Without a divide instruction selected, the result is 1 when (div - d) is
 * negative and 0 otherwise; on the other targets a plain division is used.
 *
 * d    Dividend.
 * div  Divisor.
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_4096_word_div_word_71(sp_digit d, sp_digit div)
{
#if !defined(__x86_64__) && !defined(__i386__) && !defined(__aarch64__) && \
    !defined(SP_DIV_WORD_USE_DIV)
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#else
    return d / div;
#endif
}
/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are scaled by 2^11 before dividing and the remainder is
 * shifted back down by 11 bits at the end.
 *
 * a  Number to be divided (142 digits are read).
 * d  Number to divide with (71 digits are read).
 * m  Multiplier result. Unused.
 * r  Remainder from the division (142 digits are written).
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_div_71(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    /* One buffer holding t1 (143 digits), t2 (72) and sd (72). */
    sp_digit t1[4 * 71 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 71 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Carve the scratch areas out of the single buffer. */
        t2 = t1 + 142 + 1;
        sd = t2 + 71 + 1;

        /* sd = d * 2^11 and t1 = a * 2^11. */
        sp_4096_mul_d_71(sd, d, (sp_digit)1 << 11);
        sp_4096_mul_d_142(t1, a, (sp_digit)1 << 11);
        /* Top digit of the scaled divisor is used for quotient estimates. */
        dv = sd[70];
        t1[71 + 71] += t1[71 + 71 - 1] >> 29;
        t1[71 + 71 - 1] &= 0x1fffffff;
        for (i=71; i>=0; i--) {
            /* Estimate a quotient digit from the top two dividend digits. */
            r1 = sp_4096_div_word_71(t1[71 + i], t1[71 + i - 1], dv);

            /* Subtract estimate * sd from the current window. */
            sp_4096_mul_d_71(t2, sd, r1);
            (void)sp_4096_sub_71(&t1[i], &t1[i], t2);
            sp_4096_norm_71(&t1[i]);
            t1[71 + i] -= t2[71];
            t1[71 + i] += t1[71 + i - 1] >> 29;
            t1[71 + i - 1] &= 0x1fffffff;
            /* Second estimate from the negated top digits; the matching
             * multiple of sd is added back to the window. */
            r1 = sp_4096_div_word_71(-t1[71 + i], -t1[71 + i - 1], dv);
            r1 -= t1[71 + i];
            sp_4096_mul_d_71(t2, sd, r1);
            (void)sp_4096_add_71(&t1[i], &t1[i], t2);
            t1[71 + i] += t1[71 + i - 1] >> 29;
            t1[71 + i - 1] &= 0x1fffffff;
        }
        t1[71 - 1] += t1[71 - 2] >> 29;
        t1[71 - 2] &= 0x1fffffff;
        /* Last step works on the single remaining top digit. */
        r1 = sp_4096_word_div_word_71(t1[71 - 1], dv);

        sp_4096_mul_d_71(t2, sd, r1);
        sp_4096_sub_71(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 142U);
        for (i=0; i<70; i++) {
            r[i+1] += r[i] >> 29;
            r[i] &= 0x1fffffff;
        }
        /* Add sd back when the top digit of the result is negative. */
        sp_4096_cond_add_71(r, r, sd, r[70] >> 31);

        sp_4096_norm_71(r);
        /* Undo the 2^11 scaling. */
        sp_4096_rshift_71(r, r, 11);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Calls sp_4096_div_71 with no quotient output and returns its result.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_mod_71(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* Only the remainder is wanted: pass NULL for the quotient. */
    return sp_4096_div_71(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is compiled:
 *  - WOLFSSL_SP_SMALL without WOLFSSL_SP_FAST_MODEXP: bit at a time with
 *    three temporaries.
 *  - otherwise, unless WC_NO_CACHE_RESISTANT is defined: the same bit at a
 *    time loop without the initial clearing of the temporaries.
 *  - otherwise: 5-bit fixed window using a table of 32 powers.
 *
 * r        A single precision number that is the result of the operation.
 *          142 digits are written.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not zero, a is reduced modulo m before use.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_4096_mod_exp_71(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 142];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 71 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 71 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 71U * 2U);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_71(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_71(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 71U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = a * norm mod m: the base in Montgomery form. */
        sp_4096_mul_71(t[1], t[1], norm);
        err = sp_4096_mod_71(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the top digit of e with its bits aligned to bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            /* Next exponent bit, most significant first. */
            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The entry not selected by y receives t[0] * t[1]. */
            sp_4096_mont_mul_71(t[y^1], t[0], t[1], m, mp);

            /* The entry selected by y is squared via t[2]. The address is
             * formed with addr_mask (defined elsewhere in this file),
             * presumably to avoid indexing directly by the secret bit -
             * confirm against its definition. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 71 * 2);
            sp_4096_mont_sqr_71(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 71 * 2);
        }

        /* Convert out of Montgomery form and subtract m if not below it. */
        sp_4096_mont_reduce_71(t[0], m, mp);
        n = sp_4096_cmp_71(t[0], m);
        sp_4096_cond_sub_71(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 71 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 142];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 71 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 71 * 2);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_71(norm, m);

        /* t[1] = a * norm mod m: the base in Montgomery form. */
        if (reduceA != 0) {
            err = sp_4096_mod_71(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_71(t[1], t[1], norm);
                err = sp_4096_mod_71(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_71(t[1], a, norm);
            err = sp_4096_mod_71(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Start at the top digit of e with its bits aligned to bit 28. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            /* Next exponent bit, most significant first. */
            y = (int)((n >> 28) & 1);
            n <<= 1;

            /* The entry not selected by y receives t[0] * t[1]. */
            sp_4096_mont_mul_71(t[y^1], t[0], t[1], m, mp);

            /* The entry selected by y is squared via t[2]; see addr_mask
             * (defined elsewhere in this file) for how it is addressed. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 71 * 2);
            sp_4096_mont_sqr_71(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 71 * 2);
        }

        /* Convert out of Montgomery form and subtract m if not below it. */
        sp_4096_mont_reduce_71(t[0], m, mp);
        n = sp_4096_cmp_71(t[0], m);
        sp_4096_cond_sub_71(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 71 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    /* 32 table entries plus one working value, 142 digits each. */
    sp_digit td[(32 * 142) + 142];
#endif
    sp_digit* t[32];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 142) + 142), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]; rt follows the 32 table entries. */
        norm = td;
        for (i=0; i<32; i++)
            t[i] = td + i * 142;
        rt = td + 4544;

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_71(norm, m);

        /* t[1] = a * norm mod m: the base in Montgomery form. */
        if (reduceA != 0) {
            err = sp_4096_mod_71(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_71(t[1], t[1], norm);
                err = sp_4096_mod_71(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_71(t[1], a, norm);
            err = sp_4096_mod_71(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k]. */
        sp_4096_mont_sqr_71(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_71(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_71(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_71(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_71(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_71(t[ 7], t[ 4], t[ 3], m, mp);
        sp_4096_mont_sqr_71(t[ 8], t[ 4], m, mp);
        sp_4096_mont_mul_71(t[ 9], t[ 5], t[ 4], m, mp);
        sp_4096_mont_sqr_71(t[10], t[ 5], m, mp);
        sp_4096_mont_mul_71(t[11], t[ 6], t[ 5], m, mp);
        sp_4096_mont_sqr_71(t[12], t[ 6], m, mp);
        sp_4096_mont_mul_71(t[13], t[ 7], t[ 6], m, mp);
        sp_4096_mont_sqr_71(t[14], t[ 7], m, mp);
        sp_4096_mont_mul_71(t[15], t[ 8], t[ 7], m, mp);
        sp_4096_mont_sqr_71(t[16], t[ 8], m, mp);
        sp_4096_mont_mul_71(t[17], t[ 9], t[ 8], m, mp);
        sp_4096_mont_sqr_71(t[18], t[ 9], m, mp);
        sp_4096_mont_mul_71(t[19], t[10], t[ 9], m, mp);
        sp_4096_mont_sqr_71(t[20], t[10], m, mp);
        sp_4096_mont_mul_71(t[21], t[11], t[10], m, mp);
        sp_4096_mont_sqr_71(t[22], t[11], m, mp);
        sp_4096_mont_mul_71(t[23], t[12], t[11], m, mp);
        sp_4096_mont_sqr_71(t[24], t[12], m, mp);
        sp_4096_mont_mul_71(t[25], t[13], t[12], m, mp);
        sp_4096_mont_sqr_71(t[26], t[13], m, mp);
        sp_4096_mont_mul_71(t[27], t[14], t[13], m, mp);
        sp_4096_mont_sqr_71(t[28], t[14], m, mp);
        sp_4096_mont_mul_71(t[29], t[15], t[14], m, mp);
        sp_4096_mont_sqr_71(t[30], t[15], m, mp);
        sp_4096_mont_mul_71(t[31], t[16], t[15], m, mp);

        /* Round the bit count up to a whole number of 5-bit windows. */
        bits = ((bits + 4) / 5) * 5;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* Load the top digit of e aligned to the top of n; rounding up may
         * have moved the start beyond the 71 digits of e. */
        if (i < 71) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* Fewer than 5 bits loaded: bring in the next digit as well. */
        if (c < 5) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* The first window selects the starting value. */
        y = (int)((n >> 27) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 142);
        while ((i >= 0) || (c >= 5)) {
            if (c >= 5) {
                /* Whole window available in n. */
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* n is empty: load the next digit and take a window. */
                n = e[i--] << 3;
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c = 24;
            }
            else {
                /* Window straddles two digits of e. */
                y = (byte)((n >> 27) & 0x1f);
                n = e[i--] << 3;
                c = 5 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* rt = rt^32 * t[y] */
            sp_4096_mont_sqr_71(rt, rt, m, mp);
            sp_4096_mont_sqr_71(rt, rt, m, mp);
            sp_4096_mont_sqr_71(rt, rt, m, mp);
            sp_4096_mont_sqr_71(rt, rt, m, mp);
            sp_4096_mont_sqr_71(rt, rt, m, mp);

            sp_4096_mont_mul_71(rt, rt, t[y], m, mp);
        }

        /* Convert out of Montgomery form and subtract m if not below it. */
        sp_4096_mont_reduce_71(rt, m, mp);
        n = sp_4096_cmp_71(rt, m);
        sp_4096_cond_sub_71(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 142);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */
#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */

/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted independently; no borrow is propagated here.
 *
 * r  A single precision integer (142 digits).
 * a  A single precision integer (142 digits).
 * b  A single precision integer (142 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_142(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 142) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 * The digit constants used give n = 141 * 29 + 7 = 4096.
 * Given m must be 4096 bits, just need to subtract.
 *
 * r  A single precision number (142 digits).
 * m  A single precision number (142 digits).
 */
static void sp_4096_mont_norm_142(sp_digit* r, const sp_digit* m)
{
    int idx = 0;

    /* r = 2^n - 1: 141 full digits and a 7-bit top digit. */
    while (idx < 141) {
        r[idx++] = 0x1fffffff;
    }
    r[idx] = 0x7fL;

    /* r = (2^n - 1) - m */
    (void)sp_4096_sub_142(r, r, m);

    /* Add back the one. */
    r[0] += 1;
}

/* Constant time comparison of two 142-digit numbers.
 *
 * Digits are visited from most to least significant and every digit is
 * always processed; there is no early exit.
 *
 * a  First number: 142 digits.
 * b  Second number: 142 digits.
 * returns a negative value, 0 or a positive value when a is less than, equal
 * to or greater than b respectively.
 */
static sp_digit sp_4096_cmp_142(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int idx = 142;

    while (idx-- > 0) {
        /* Mask is all ones while no difference has been recorded. Once diff
         * is non-zero the mask keeps later digits from changing its sign
         * (assuming sp_digit is a signed 32-bit type). */
        sp_digit keep = ~(((sp_digit)0 - diff) >> 28);

        diff |= (a[idx] - b[idx]) & keep;
    }

    return diff;
}

/* Masked digit-wise subtraction. (r = a - (b & m))
 * Pass m as -1 to subtract b and as 0 to leave a unchanged.
 *
 * r  Result of the conditional subtraction: 142 digits.
 * a  Number to subtract from: 142 digits.
 * b  Number to subtract: 142 digits.
 * m  Mask applied to every digit of b.
 */
static void sp_4096_cond_sub_142(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    do {
        sp_digit masked = b[idx] & m;

        r[idx] = a[idx] - masked;
    } while (++idx < 142);
}

/* Multiply a 142-digit number by a scalar and accumulate. (r += a * b)
 *
 * The low 142 digits of r are left masked to 29 bits; the final carry is
 * added into r[142].
 *
 * r  Accumulator: 143 digits are touched.
 * a  Number to multiply: 142 digits.
 * b  Scalar multiplier.
 */
SP_NOINLINE static void sp_4096_mul_add_142(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    /* One digit per pass: accumulate, store low 29 bits, keep the carry. */
    while (idx < 142) {
        acc += r[idx];
        acc += scalar * a[idx];
        r[idx] = ((sp_digit)acc) & 0x1fffffff;
        acc >>= 29;
        idx++;
    }
    r[142] += (sp_digit)acc;
#else
    sp_int64 scalar = b;
    sp_int64 w0 = 0;
    sp_int64 w1;
    sp_int64 w2;
    sp_int64 w3;
    int idx;

    /* Four digits per pass; w0 carries over into the next pass. */
    for (idx = 0; idx < 140; idx += 4) {
        w0 += (scalar * a[idx+0]) + r[idx+0];
        w1  = (scalar * a[idx+1]) + r[idx+1];
        w2  = (scalar * a[idx+2]) + r[idx+2];
        w3  = (scalar * a[idx+3]) + r[idx+3];
        r[idx+0] = w0 & 0x1fffffff;
        w1 += w0 >> 29;
        r[idx+1] = w1 & 0x1fffffff;
        w2 += w1 >> 29;
        r[idx+2] = w2 & 0x1fffffff;
        w3 += w2 >> 29;
        r[idx+3] = w3 & 0x1fffffff;
        w0  = w3 >> 29;
    }
    /* Remaining two digits (140 and 141) and the final carry. */
    w0 += (scalar * a[140]) + r[140];
    w1  = (scalar * a[141]) + r[141];
    r[140] = w0 & 0x1fffffff;
    w1 += w0 >> 29;
    r[141] = w1 & 0x1fffffff;
    r[142] +=  (sp_digit)(w1 >> 29);
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the high 4096 bits of a double-width value down to the bottom.
 *
 * Bit 4096 sits 7 bits into digit 141, so each output digit is assembled from
 * the top 22 bits of one input digit and the low 7 bits of the next. The
 * upper 142 digits of r are cleared afterwards.
 *
 * r  Result: 284 digits are written.
 * a  Number to shift: 284 digits are read. May be the same buffer as r.
 */
static void sp_4096_mont_shift_142(sp_digit* r, const sp_digit* a)
{
    const sp_digit* hi = a + 142;
    sp_int64 acc;
    int idx;

    acc  = a[141] >> 7;
    acc += ((sp_int64)hi[0]) << 22;

    for (idx = 0; idx < 141; idx++) {
        r[idx] = acc & 0x1fffffff;
        acc >>= 29;
        acc += ((sp_int64)hi[idx + 1]) << 22;
    }
    r[141] = (sp_digit)acc;
    XMEMSET(r + 142, 0, sizeof(*r) * 142U);
}

/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * For each of the low 142 digits a multiple (mu) of m is added in with
 * sp_4096_mul_add_142 and the digit's overflow is pushed into the next digit.
 * mu is masked to 29 bits for digits 0..140 and to 7 bits for digit 141.
 * The high part is then shifted down with sp_4096_mont_shift_142 and m is
 * conditionally subtracted once.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_4096_mont_reduce_142(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper half before carries are added into it. */
    sp_4096_norm_142(a + 142);

#ifdef WOLFSSL_SP_DH
    if (mp != 1) {
        for (i=0; i<141; i++) {
            mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
            sp_4096_mul_add_142(a+i, m, mu);
            a[i+1] += a[i] >> 29;
        }
        /* Top digit only has 7 bits below bit 4096. */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x7fL;
        sp_4096_mul_add_142(a+i, m, mu);
        a[i+1] += a[i] >> 29;
        a[i] &= 0x1fffffff;
    }
    else {
        /* mp is 1: the multiplication by mp is skipped. */
        for (i=0; i<141; i++) {
            mu = a[i] & 0x1fffffff;
            sp_4096_mul_add_142(a+i, m, mu);
            a[i+1] += a[i] >> 29;
        }
        mu = a[i] & 0x7fL;
        sp_4096_mul_add_142(a+i, m, mu);
        a[i+1] += a[i] >> 29;
        a[i] &= 0x1fffffff;
    }
#else
    for (i=0; i<141; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
        sp_4096_mul_add_142(a+i, m, mu);
        a[i+1] += a[i] >> 29;
    }
    /* Top digit only has 7 bits below bit 4096. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x7fL;
    sp_4096_mul_add_142(a+i, m, mu);
    a[i+1] += a[i] >> 29;
    a[i] &= 0x1fffffff;
#endif
    sp_4096_mont_shift_142(a, a);
    /* Subtract m when the top digit of a is strictly greater than the top
     * digit of m: the mask is all ones exactly when (over - 1) is not
     * negative. */
    over = a[141] - m[141];
    sp_4096_cond_sub_142(a, a, m, ~((over - 1) >> 31));
    sp_4096_norm_142(a);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The product is written to r by sp_4096_mul_142 and then reduced in place by
 * sp_4096_mont_reduce_142.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus. NOTE(review): previously documented as "prime", but callers in
 *     this file also pass an RSA modulus - confirm nothing relies on
 *     primality.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_mul_142(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_4096_mul_142(r, a, b);
    sp_4096_mont_reduce_142(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_4096_sqr_142 and then reduced in place by
 * sp_4096_mont_reduce_142.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus. NOTE(review): previously documented as "prime", but callers in
 *     this file also pass an RSA modulus - confirm nothing relies on
 *     primality.
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_sqr_142(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_4096_sqr_142(r, a);
    sp_4096_mont_reduce_142(r, m, mp);
}

/* Multiply a 284-digit number by a scalar. (r = a * b)
 *
 * Each result digit is masked to 29 bits; the final carry goes to r[284].
 *
 * r  Result: 285 digits are written.
 * a  Number to multiply: 284 digits.
 * b  Scalar multiplier.
 */
SP_NOINLINE static void sp_4096_mul_d_284(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    while (idx < 284) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    r[284] = (sp_digit)acc;
}

#ifdef WOLFSSL_SP_SMALL
/* Masked digit-wise addition. (r = a + (b & m))
 * Pass m as -1 to add b and as 0 to leave a unchanged.
 *
 * r  Result of the conditional addition: 142 digits.
 * a  Number to add to: 142 digits.
 * b  Number to add: 142 digits.
 * m  Mask applied to every digit of b.
 */
static void sp_4096_cond_add_142(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int idx = 0;

    do {
        sp_digit masked = b[idx] & m;

        r[idx] = a[idx] + masked;
    } while (++idx < 142);
}
#endif /* WOLFSSL_SP_SMALL */

/* Digit-wise addition of two 142-digit numbers. (r = a + b)
 *
 * Every digit is handled on its own; no carry is moved between digits here.
 *
 * r  Result: 142 digits.
 * a  First number to add: 142 digits.
 * b  Second number to add: 142 digits.
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_142(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 142) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}

/* Shift a 142-digit number right by a number of bits. (r = a >> n)
 *
 * r  Result: 142 digits. May be the same buffer as a.
 * a  Number to shift: 142 digits.
 * n  Number of bits to shift by. The (29 - n) shift below needs n to be no
 *    more than 29; the caller visible in this section passes 22.
 */
SP_NOINLINE static void sp_4096_rshift_142(sp_digit* r, const sp_digit* a,
        byte n)
{
    int idx = 0;

    while (idx < 141) {
        /* Low part from this digit, high part from the next digit up. */
        sp_digit lo = a[idx] >> n;
        sp_digit hi = a[idx + 1] << (29 - n);

        r[idx] = (lo | hi) & 0x1fffffff;
        idx++;
    }
    r[141] = a[141] >> n;
}

/* Divide the double-digit value (d1 * 2^29 + d0) by div.
 *
 * One of four implementations is selected at compile time:
 *   - SP_USE_DIVTI3: plain 64-bit C division.
 *   - x86 / x86_64: the idiv instruction.
 *   - default (not aarch64 and SP_DIV_WORD_USE_DIV not defined): a quotient
 *     built from shifts, subtractions and masks, followed by fix-up steps
 *     that use multiplication only.
 *   - otherwise: three narrower C divisions that refine the quotient.
 * NOTE(review): the last two look like estimate-then-correct schemes - confirm
 * against the caller how exact the result is expected to be.
 *
 * d1   High digit of the number to divide.
 * d0   Low digit of the number to divide.
 * div  Digit to divide by.
 * returns the quotient as calculated by the selected implementation.
 */
static WC_INLINE sp_digit sp_4096_div_word_142(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 29) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* idiv divides edx:eax by the operand, leaving the quotient in eax and
     * the remainder in edx. Both registers are written by the instruction so
     * both are declared read-write; an input-only edx would allow the compiler
     * to assume it still holds hi after the statement. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo), "+d" (hi)
        : "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 29);
    sp_digit t0 = (sp_digit)(d & 0x1fffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Bit-at-a-time quotient against dv; t2 is 1 when t1 exceeds dv and the
     * subtraction is applied through a mask rather than a branch. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 27; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 28) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Fix-ups: m is what remains of d after taking r * div away. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 29);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 58) - (sp_digit)(d >> 58);

    /* Two final +/-1 adjustments: sign is -1 when m is negative, else 1;
     * t2 is 1 when |m| is greater than div. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
    return r;
#else
    sp_int64 d = ((sp_int64)d1 << 29) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 14) + 1;

    /* Three rounds: divide what is left of d by a (rounded up) piece of div,
     * add the partial quotient to r and take it away from d. */
    t = (sp_digit)(d >> 28);
    t = (t / dv) << 14;
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)(d >> 13);
    t = t / (dv << 1);
    r += t;
    d -= (sp_int64)t * div;
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit by a single digit.
 *
 * Where a hardware or library divide is used (x86, x86_64, aarch64 or
 * SP_DIV_WORD_USE_DIV) this is a plain C division. Otherwise no division is
 * performed: the result is 1 when (div - d) is negative as a 32-bit value,
 * that is when d is greater than div, and 0 when not.
 *
 * d    Digit to divide.
 * div  Digit to divide by.
 * returns the quotient, or the 0/1 value described above.
 */
static WC_INLINE sp_digit sp_4096_word_div_word_142(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both numbers are first multiplied by 2^22 and the remainder is shifted
 * right by 22 bits at the end. Each step estimates a quotient digit from the
 * top digits, subtracts the estimate times the divisor, then estimates a
 * correction from the negated top digits and adds it back.
 *
 * a  Number to be divided: 284 digits are read.
 * d  Number to divide with: 142 digits.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 284 digits are copied into r before the
 *    low 142 are finalized, so it must have room for 284 digits.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_div_142(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 142 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 142 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* Buffer layout: t1 - 285 digits (scaled a), t2 - 143 digits
         * (product scratch), sd - 143 digits (scaled d). */
        t2 = t1 + 284 + 1;
        sd = t2 + 142 + 1;

        /* Scale by 2^22 - presumably so that the 7-bit top digit of a
         * 4096-bit divisor fills all 29 bits. */
        sp_4096_mul_d_142(sd, d, (sp_digit)1 << 22);
        sp_4096_mul_d_284(t1, a, (sp_digit)1 << 22);
        dv = sd[141];
        t1[142 + 142] += t1[142 + 142 - 1] >> 29;
        t1[142 + 142 - 1] &= 0x1fffffff;
        for (i=142; i>=0; i--) {
            /* Estimate the quotient digit from the top two digits. */
            r1 = sp_4096_div_word_142(t1[142 + i], t1[142 + i - 1], dv);

            /* Take estimate * divisor away from the current window. */
            sp_4096_mul_d_142(t2, sd, r1);
            (void)sp_4096_sub_142(&t1[i], &t1[i], t2);
            sp_4096_norm_142(&t1[i]);
            t1[142 + i] -= t2[142];
            t1[142 + i] += t1[142 + i - 1] >> 29;
            t1[142 + i - 1] &= 0x1fffffff;
            /* Estimate a correction from the negated top digits and add the
             * corresponding multiple of the divisor back. */
            r1 = sp_4096_div_word_142(-t1[142 + i], -t1[142 + i - 1], dv);
            r1 -= t1[142 + i];
            sp_4096_mul_d_142(t2, sd, r1);
            (void)sp_4096_add_142(&t1[i], &t1[i], t2);
            t1[142 + i] += t1[142 + i - 1] >> 29;
            t1[142 + i - 1] &= 0x1fffffff;
        }
        t1[142 - 1] += t1[142 - 2] >> 29;
        t1[142 - 2] &= 0x1fffffff;
        /* Last step works on the single remaining top digit. */
        r1 = sp_4096_word_div_word_142(t1[142 - 1], dv);

        sp_4096_mul_d_142(t2, sd, r1);
        sp_4096_sub_142(t1, t1, t2);
        XMEMCPY(r, t1, sizeof(*r) * 284U);
        /* Propagate carries through the low 142 digits. */
        for (i=0; i<141; i++) {
            r[i+1] += r[i] >> 29;
            r[i] &= 0x1fffffff;
        }
        /* Add the scaled divisor back when the top digit is negative. */
        sp_4096_cond_add_142(r, r, sd, r[141] >> 31);

        sp_4096_norm_142(r);
        /* Undo the 2^22 scaling. */
        sp_4096_rshift_142(r, r, 22);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Calls sp_4096_div_142 with no quotient output and returns its result.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_mod_142(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    return sp_4096_div_142(a, m, NULL, r);
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * One of three implementations is selected at compile time:
 *   - WOLFSSL_SP_SMALL without WOLFSSL_SP_FAST_MODEXP: one exponent bit per
 *     iteration with one Montgomery multiply and one Montgomery square done
 *     for every bit; the three temporaries are zeroed before use.
 *   - otherwise, unless WC_NO_CACHE_RESISTANT is defined: the same
 *     bit-at-a-time loop without the initial zeroing.
 *   - WC_NO_CACHE_RESISTANT defined: 4-bit windows using a table of 16
 *     entries that is indexed directly by exponent bits.
 *
 * r        A single precision number that is the result of the operation.
 *          284 digits are copied into r in every variant.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not zero, a is reduced modulo m before use.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_4096_mod_exp_142(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 284];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 142 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 142 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 142U * 2U);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_142(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_142(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 142U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = a * norm mod m: a in Montgomery form. */
        sp_4096_mul_142(t[1], t[1], norm);
        err = sp_4096_mod_142(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* n holds the current exponent digit with the next bit to use at
         * bit 28; c counts the bits left in n. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_4096_mont_mul_142(t[y^1], t[0], t[1], m, mp);

            /* The operand to square is chosen by masking the two addresses
             * rather than by indexing t with y. addr_mask is defined outside
             * this section - presumably addr_mask[0] is 0 and addr_mask[1] is
             * all ones, which makes this t[y]; confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 142 * 2);
            sp_4096_mont_sqr_142(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 142 * 2);
        }

        /* Convert out of Montgomery form and do a final conditional
         * subtraction of m when the result is not less than m. */
        sp_4096_mont_reduce_142(t[0], m, mp);
        n = sp_4096_cmp_142(t[0], m);
        sp_4096_cond_sub_142(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 142 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 284];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 142 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 142 * 2);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_142(norm, m);

        /* t[1] = a * norm mod m: a in Montgomery form. */
        if (reduceA != 0) {
            err = sp_4096_mod_142(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_142(t[1], t[1], norm);
                err = sp_4096_mod_142(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_142(t[1], a, norm);
            err = sp_4096_mod_142(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* n holds the current exponent digit with the next bit to use at
         * bit 28; c counts the bits left in n. */
        i = bits / 29;
        c = bits % 29;
        n = e[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 29;
            }

            y = (int)((n >> 28) & 1);
            n <<= 1;

            sp_4096_mont_mul_142(t[y^1], t[0], t[1], m, mp);

            /* The operand to square is chosen by masking the two addresses
             * rather than by indexing t with y. addr_mask is defined outside
             * this section - presumably addr_mask[0] is 0 and addr_mask[1] is
             * all ones, which makes this t[y]; confirm. */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 142 * 2);
            sp_4096_mont_sqr_142(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 142 * 2);
        }

        /* Convert out of Montgomery form and do a final conditional
         * subtraction of m when the result is not less than m. */
        sp_4096_mont_reduce_142(t[0], m, mp);
        n = sp_4096_cmp_142(t[0], m);
        sp_4096_cond_sub_142(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 142 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(16 * 284) + 284];
#endif
    sp_digit* t[16];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 284) + 284), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* 16 table entries of 284 digits followed by the running result rt.
         * norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<16; i++)
            t[i] = td + i * 284;
        rt = td + 4544;

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_142(norm, m);

        /* t[1] = a * norm mod m: a in Montgomery form. */
        if (reduceA != 0) {
            err = sp_4096_mod_142(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_142(t[1], t[1], norm);
                err = sp_4096_mod_142(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_142(t[1], a, norm);
            err = sp_4096_mod_142(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill the table: t[k] is the k-th power of t[1] for k = 2..15. */
        sp_4096_mont_sqr_142(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_142(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_142(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_142(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_142(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_142(t[ 7], t[ 4], t[ 3], m, mp);
        sp_4096_mont_sqr_142(t[ 8], t[ 4], m, mp);
        sp_4096_mont_mul_142(t[ 9], t[ 5], t[ 4], m, mp);
        sp_4096_mont_sqr_142(t[10], t[ 5], m, mp);
        sp_4096_mont_mul_142(t[11], t[ 6], t[ 5], m, mp);
        sp_4096_mont_sqr_142(t[12], t[ 6], m, mp);
        sp_4096_mont_mul_142(t[13], t[ 7], t[ 6], m, mp);
        sp_4096_mont_sqr_142(t[14], t[ 7], m, mp);
        sp_4096_mont_mul_142(t[15], t[ 8], t[ 7], m, mp);

        /* Round the bit count up to a whole number of 4-bit windows. */
        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        /* n holds exponent bits left-aligned in 32 bits; c counts the bits
         * of n that are still to be used. */
        if (i < 142) {
            n = e[i--] << (32 - c);
        }
        else {
            /* Rounding went past the top digit: start with zero bits. */
            n = 0;
            i--;
        }
        if (c < 4) {
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* First window selects the starting value directly. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 284);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* Whole window available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is empty: load the next digit and take a window. */
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                /* Window straddles two digits: take what is left in n and
                 * the rest from the next digit. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* Four squarings then one multiply by the table entry. */
            sp_4096_mont_sqr_142(rt, rt, m, mp);
            sp_4096_mont_sqr_142(rt, rt, m, mp);
            sp_4096_mont_sqr_142(rt, rt, m, mp);
            sp_4096_mont_sqr_142(rt, rt, m, mp);

            sp_4096_mont_mul_142(rt, rt, t[y], m, mp);
        }

        /* Convert out of Montgomery form and do a final conditional
         * subtraction of m when the result is not less than m. */
        sp_4096_mont_reduce_142(rt, m, mp);
        n = sp_4096_cmp_142(rt, m);
        sp_4096_cond_sub_142(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 284);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Calculates in^em mod mm for a 4096-bit modulus and an exponent of at most
 * 64 bits. The WOLFSSL_SP_SMALL build always uses square-and-multiply over
 * the exponent bits; the other build special-cases an exponent of 3.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  Number of bytes in result. On entry holds the size of out; set to
 *         512 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent has more than 64 bits, the input is longer than 512 bytes or
 * the modulus is not 4096 bits, MP_VAL when the modulus is even, MP_EXPTMOD_E
 * when the exponent is zero and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[142 * 5];
#endif
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_digit* norm = NULL;
    sp_uint64 e[1] = {0};
    sp_digit mp = 0;
    int i;
    int err = MP_OKAY;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }

    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: a - 284 digits, r - 284 digits, m - 142 digits.
         * norm shares storage with r. */
        r = a + 142 * 2;
        m = r + 142 * 2;
        norm = r;

        sp_4096_from_bin(a, 142, in, inLen);
        /* Gather the exponent into a single 64-bit value. */
#if DIGIT_BIT >= 64
        e[0] = (sp_uint64)em->dp[0];
#else
        e[0] = (sp_uint64)em->dp[0];
        if (em->used > 1) {
            e[0] |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_4096_from_mp(m, 142, mm);

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_142(norm, m);
    }
    if (err == MP_OKAY) {
        /* a = a * norm mod m: base in Montgomery form. */
        sp_4096_mul_142(a, a, norm);
        err = sp_4096_mod_142(a, a, m);
    }
    if (err == MP_OKAY) {
        /* Find the top set bit of the exponent. */
        for (i=63; i>=0; i--) {
            if ((e[0] >> i) != 0) {
                break;
            }
        }

        /* Top bit is consumed by starting with r = a; square for every
         * remaining bit and multiply when the bit is set. */
        XMEMCPY(r, a, sizeof(sp_digit) * 142 * 2);
        for (i--; i>=0; i--) {
            sp_4096_mont_sqr_142(r, r, m, mp);

            if (((e[0] >> i) & 1) == 1) {
                sp_4096_mont_mul_142(r, r, a, m, mp);
            }
        }
        /* Convert out of Montgomery form; mp is reused to hold the compare
         * result for the final conditional subtraction. */
        sp_4096_mont_reduce_142(r, m, mp);
        mp = sp_4096_cmp_142(r, m);
        sp_4096_cond_sub_142(r, r, m, ~(mp >> 31));

        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[142 * 5];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    sp_uint64 e[1] = {0};
    int err = MP_OKAY;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(em) > 64) {
            err = MP_READ_E;
        }
        else if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 5, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: a - 284 digits, r - 284 digits, m - 142 digits. */
        a = d;
        r = a + 142 * 2;
        m = r + 142 * 2;

        sp_4096_from_bin(a, 142, in, inLen);
        /* Gather the exponent into a single 64-bit value. */
#if DIGIT_BIT >= 64
        e[0] = (sp_uint64)em->dp[0];
#else
        e[0] = (sp_uint64)em->dp[0];
        if (em->used > 1) {
            e[0] |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (e[0] == 0) {
            err = MP_EXPTMOD_E;
        }
    }
    if (err == MP_OKAY) {
        sp_4096_from_mp(m, 142, mm);

        if (e[0] == 0x3) {
            /* Exponent 3: r = a^2 mod m, then r = a * r mod m, without
             * going through Montgomery form. */
            sp_4096_sqr_142(r, a);
            err = sp_4096_mod_142(r, r, m);
            if (err == MP_OKAY) {
                sp_4096_mul_142(r, a, r);
                err = sp_4096_mod_142(r, r, m);
            }
        }
        else {
            /* norm shares storage with r. */
            sp_digit* norm = r;
            int i;
            sp_digit mp;

            sp_4096_mont_setup(m, &mp);
            sp_4096_mont_norm_142(norm, m);

            /* a = a * norm mod m: base in Montgomery form. */
            sp_4096_mul_142(a, a, norm);
            err = sp_4096_mod_142(a, a, m);

            if (err == MP_OKAY) {
                /* Find the top set bit of the exponent. */
                for (i=63; i>=0; i--) {
                    if ((e[0] >> i) != 0) {
                        break;
                    }
                }

                /* Top bit is consumed by starting with r = a; square for
                 * every remaining bit and multiply when the bit is set. */
                XMEMCPY(r, a, sizeof(sp_digit) * 284U);
                for (i--; i>=0; i--) {
                    sp_4096_mont_sqr_142(r, r, m, mp);

                    if (((e[0] >> i) & 1) == 1) {
                        sp_4096_mont_mul_142(r, r, a, m, mp);
                    }
                }
                /* Convert out of Montgomery form; mp is reused to hold the
                 * compare result for the final conditional subtraction. */
                sp_4096_mont_reduce_142(r, m, mp);
                mp = sp_4096_cmp_142(r, m);
                sp_4096_cond_sub_142(r, r, m, ~(mp >> 31));
            }
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
#endif /* WOLFSSL_SP_SMALL */
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM)
#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */
/* RSA private key operation.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  Number of bytes in result.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long and MEMORY_E when dynamic memory allocation fails.
 */
int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
#if defined(WOLFSSL_SP_SMALL)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit  d[142 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 4096) {
           err = MP_READ_E;
        }
        else if (inLen > 512) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        a = d + 142;
        m = a + 284;
        r = a;

        sp_4096_from_bin(a, 142, in, inLen);
        sp_4096_from_mp(d, 142, dm);
        sp_4096_from_mp(m, 142, mm);
        err = sp_4096_mod_exp_142(r, a, d, 4096, m, 0);
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* only "a" and "r" are sensitive and need zeroized (same pointer) */
        if (a != NULL)
            ForceZero(a, sizeof(sp_digit) * 142);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[142 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 4096) {
            err = MP_READ_E;
        }
        else if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        a = d + 142;
        m = a + 284;
        r = a;

        sp_4096_from_bin(a, 142, in, inLen);
        sp_4096_from_mp(d, 142, dm);
        sp_4096_from_mp(m, 142, mm);
        err = sp_4096_mod_exp_142(r, a, d, 4096, m, 0);
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* only "a" and "r" are sensitive and need zeroized (same pointer) */
        if (a != NULL)
            ForceZero(a, sizeof(sp_digit) * 142);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#else
#if defined(WOLFSSL_SP_SMALL)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[71 * 8];
#endif
    sp_digit* p = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 512) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 71 * 8, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif
    if (err == MP_OKAY) {
        p = a + 142;
        qi = dq = dp = p + 71;
        tmpa = qi + 71;
        tmpb = tmpa + 142;
        r = a;

        sp_4096_from_bin(a, 142, in, inLen);
        sp_4096_from_mp(p, 71, pm);
        sp_4096_from_mp(dp, 71, dpm);
        err = sp_4096_mod_exp_71(tmpa, a, dp, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        sp_4096_from_mp(p, 71, qm);
        sp_4096_from_mp(dq, 71, dqm);
        err = sp_4096_mod_exp_71(tmpb, a, dq, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        sp_4096_from_mp(p, 71, pm);
        (void)sp_4096_sub_71(tmpa, tmpa, tmpb);
        sp_4096_norm_71(tmpa);
        sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31));
        sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31));
        sp_4096_norm_71(tmpa);

        sp_4096_from_mp(qi, 71, qim);
        sp_4096_mul_71(tmpa, tmpa, qi);
        err = sp_4096_mod_71(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        sp_4096_from_mp(p, 71, qm);
        sp_4096_mul_71(tmpa, p, tmpa);
        (void)sp_4096_add_142(r, tmpb, tmpa);
        sp_4096_norm_142(r);

        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        ForceZero(a, sizeof(sp_digit) * 71 * 8);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[71 * 13];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;
    (void)mm;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 71 * 13, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        p = a + 142 * 2;
        q = p + 71;
        dp = q + 71;
        dq = dp + 71;
        qi = dq + 71;
        tmpa = qi + 71;
        tmpb = tmpa + 142;
        r = a;

        sp_4096_from_bin(a, 142, in, inLen);
        sp_4096_from_mp(p, 71, pm);
        sp_4096_from_mp(q, 71, qm);
        sp_4096_from_mp(dp, 71, dpm);
        sp_4096_from_mp(dq, 71, dqm);
        sp_4096_from_mp(qi, 71, qim);

        err = sp_4096_mod_exp_71(tmpa, a, dp, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        err = sp_4096_mod_exp_71(tmpb, a, dq, 2048, q, 1);
    }

    if (err == MP_OKAY) {
        (void)sp_4096_sub_71(tmpa, tmpa, tmpb);
        sp_4096_norm_71(tmpa);
        sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31));
        sp_4096_cond_add_71(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[70] >> 31));
        sp_4096_norm_71(tmpa);
        sp_4096_mul_71(tmpa, tmpa, qi);
        err = sp_4096_mod_71(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        sp_4096_mul_71(tmpa, tmpa, q);
        (void)sp_4096_add_142(r, tmpb, tmpa);
        sp_4096_norm_142(r);

        sp_4096_to_bin_142(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
if (a != NULL)
#endif
    {
        ForceZero(a, sizeof(sp_digit) * 71 * 13);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
    #endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
}

#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Convert an array of sp_digit to an mp_int.
 *
 * Reads 142 words of 29 bits each from a and repacks them into the
 * DIGIT_BIT-wide digits of r. r is grown to hold 4096 bits first and is
 * clamped afterwards.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the value returned by mp_grow() (MP_OKAY when it succeeds).
 */
static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    /* Ask for enough mp digits to hold a 4096-bit value. */
    err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 29
        /* Same digit width on both sides: copy the words unchanged. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 142);
        r->used = 142;
        mp_clamp(r);
#elif DIGIT_BIT < 29
        /* mp digits are narrower than the 29-bit source words, so every
         * source word is spread over more than one mp digit. */
        int i;
        int j = 0; /* Index of the mp digit currently being filled. */
        int s = 0; /* At the top of the loop: bit position in r->dp[j] at
                    * which a[i] is placed. */

        r->dp[0] = 0;
        for (i = 0; i < 142; i++) {
            /* Finish the current mp digit with the low bits of a[i]. */
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            /* s now counts the bits of a[i] that have been consumed. */
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            /* Emit further whole mp digits while a[i] still has at least
             * DIGIT_BIT unconsumed bits. */
            while (s + DIGIT_BIT <= 29) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    /* Avoids shifting a[i] by the full word size. */
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            /* Bits of a[i] left in r->dp[j]; the next word starts there. */
            s = 29 - s;
        }
        r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp digits are wider than the 29-bit source words: source words are
         * OR-ed into the current mp digit until it is full, and the bits that
         * do not fit start the next mp digit.
         *
         * NOTE(review): with DIGIT_BIT of 32 or 64 the last iteration (i ==
         * 141) stores to r->dp[(4096 + DIGIT_BIT - 1) / DIGIT_BIT], one digit
         * past the count passed to mp_grow() - confirm mp_grow() always leaves
         * at least one spare digit. */
        int i;
        int j = 0; /* Index of the mp digit currently being filled. */
        int s = 0; /* Bit position in r->dp[j] at which a[i] is placed. */

        r->dp[0] = 0;
        for (i = 0; i < 142; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 29 >= DIGIT_BIT) {
                /* Trim to DIGIT_BIT bits; skipped for 32 and 64 bit digits. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                /* Bits of a[i] that went into the digit just completed. */
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                /* Bits of a[i] now sitting in the new digit. */
                s = 29 - s;
            }
            else {
                s += 29;
            }
        }
        r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * The same code is used whether or not WOLFSSL_SP_SMALL is defined; only the
 * storage of the working buffer depends on WOLFSSL_SP_SMALL_STACK.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if the base or exponent has more than 4096
 * bits or the modulus does not have exactly 4096 bits, MP_VAL if the modulus
 * is even and MEMORY_E if memory allocation fails. An error from the
 * exponentiation or from converting the result is passed through.
 */
int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[142 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    /* Counted once; used for the range check and as the exponent length. */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 4096) {
        err = MP_READ_E;
    }
    else if (expBits > 4096) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 4096) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Layout of b: words [0, 284) base and result, [284, 426) exponent,
         * [426, 568) modulus. */
        e = b + 142 * 2;
        m = e + 142;
        r = b;

        sp_4096_from_mp(b, 142, base);
        sp_4096_from_mp(e, 142, exp);
        sp_4096_from_mp(m, 142, mod);

        err = sp_4096_mod_exp_142(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_4096_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 142U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_4096
/* Shift a 142 word number of 29-bit words left by n bits. (r = a << n)
 *
 * Words are processed from the top down and 143 words are written: r[142]
 * receives the bits shifted out of a[141].
 *
 * r  Result of the shift; 143 words.
 * a  Number to shift; 142 words.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_4096_lshift_142(sp_digit* r, const sp_digit* a,
        byte n)
{
    const int down = 29 - n;
    int idx = 141;

    /* Bits leaving the top word become the extra word. */
    r[142] = a[141] >> down;
    while (idx > 0) {
        /* Shifted word joined with the bits arriving from the word below. */
        r[idx] = ((a[idx] << n) | (a[idx - 1] >> down)) & 0x1fffffff;
        idx--;
    }
    /* Nothing arrives from below the lowest word. */
    r[0] = (a[0] << n) & 0x1fffffff;
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed from the top in 4-bit windows. For each window the
 * running value is squared four times and then shifted left by the value of
 * the window instead of being multiplied.
 *
 * r     A single precision number that is the result of the operation.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_4096_mod_exp_2_142(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[427];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 427, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td layout: norm is words [0, 284), tmp is words [284, 427). */
        norm = td;
        tmp  = td + 284;
        XMEMSET(td, 0, sizeof(sp_digit) * 427);

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_142(norm, m);

        /* Round the bit count up to a whole number of 4-bit windows. */
        bits = ((bits + 3) / 4) * 4;
        /* i: index of the top exponent word in use.
         * c: number of exponent bits taken from that word (1..29). */
        i = ((bits + 28) / 29) - 1;
        c = bits % 29;
        if (c == 0) {
            c = 29;
        }
        if (i < 142) {
            /* Place the c bits in use at the top of n. */
            n = e[i--] << (32 - c);
        }
        else {
            /* Rounding moved past the last exponent word: do not read
             * e[142], start from an empty word instead. */
            n = 0;
            i--;
        }
        if (c < 4) {
            /* Fewer than 4 bits available: bring in the next word down so the
             * first window is complete. */
            n |= e[i--] << (3 - c);
            c += 29;
        }
        /* First window is the top 4 bits of n. */
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        /* Starting value: norm shifted left by the first window. */
        sp_4096_lshift_142(r, norm, (byte)y);
        /* Continue while exponent words remain or n still holds a window. */
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is still available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is empty: load the next 29-bit word at the top of n. */
                n = e[i--] << 3;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 25;
            }
            else {
                /* Window straddles two words: take the c bits left in n and
                 * the remaining 4 - c bits from the next word. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 3;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 29 - c;
            }

            /* Four squarings: one for each bit of the window. */
            sp_4096_mont_sqr_142(r, r, m, mp);
            sp_4096_mont_sqr_142(r, r, m, mp);
            sp_4096_mont_sqr_142(r, r, m, mp);
            sp_4096_mont_sqr_142(r, r, m, mp);

            /* Shift left by the window value in place of a multiplication. */
            sp_4096_lshift_142(r, r, (byte)y);
            /* Bits at and above bit 4096 (141 * 29 + 7 = 4096) are removed
             * from r and added back in as that value times norm. */
            sp_4096_mul_d_142(tmp, norm, (r[142] << 22) + (r[141] >> 7));
            r[142] = 0;
            r[141] &= 0x7fL;
            (void)sp_4096_add_142(r, r, tmp);
            sp_4096_norm_142(r);
            /* Mask is all ones when the compare result is not negative. */
            o = sp_4096_cmp_142(r, m);
            sp_4096_cond_sub_142(r, r, m, ~(o >> 31));
        }

        sp_4096_mont_reduce_142(r, m, mp);
        /* Same conditional subtraction after the final reduction. */
        n = sp_4096_cmp_142(r, m);
        sp_4096_cond_sub_142(r, r, m, ~(n >> 31));
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* HAVE_FFDHE_4096 */

/* Diffie-Hellman modular exponentiation with the exponent given as bytes.
 *
 * base     Base.
 * exp      Big-endian bytes of the exponent.
 * expLen   Number of bytes in exp.
 * mod      Modulus.
 * out      Receives the big-endian result with leading zero bytes removed.
 *          Must be at least 512 bytes long.
 * outLen   Set to the number of bytes placed in out.
 * returns 0 on success, MP_READ_E if the base has more than 4096 bits, the
 * exponent is longer than 512 bytes or the modulus does not have exactly 4096
 * bits, MP_VAL if the modulus is even and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[142 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    word32 lead;
    int err = MP_OKAY;

    /* Check operand sizes; the first failing check decides the error. */
    if (mp_count_bits(base) > 4096) {
        err = MP_READ_E;
    }
    if ((err == MP_OKAY) && (expLen > 512U)) {
        err = MP_READ_E;
    }
    if ((err == MP_OKAY) && (mp_count_bits(mod) != 4096)) {
        err = MP_READ_E;
    }
    if ((err == MP_OKAY) && (mp_iseven(mod))) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 142 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Result shares the base's words; exponent and modulus follow. */
        r = b;
        e = b + 142 * 2;
        m = e + 142;

        sp_4096_from_mp(b, 142, base);
        sp_4096_from_bin(e, 142, exp, expLen);
        sp_4096_from_mp(m, 142, mod);

    #ifdef HAVE_FFDHE_4096
        /* Base of 2 and top 16 bits of the modulus all set: use the
         * dedicated base-2 exponentiation. */
        if (base->used == 1 && base->dp[0] == 2U &&
                ((m[141] << 9) | (m[140] >> 20)) == 0xffffL) {
            err = sp_4096_mod_exp_2_142(r, e, expLen * 8U, m);
        }
        else
    #endif
        {
            err = sp_4096_mod_exp_142(r, b, e, expLen * 8U, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_142(r, out);

        /* Count the leading zero bytes and move the rest to the front. */
        lead = 0;
        while ((lead < 512U) && (out[lead] == 0U)) {
            lead++;
        }
        *outLen = 512U - lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 142U);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    #endif
    }

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#else
/* Load a big endian unsigned byte array into 26-bit words.
 *
 * Bytes are taken from the end of the array towards the start. Reading stops
 * once word size - 1 has been completed; words not reached are set to zero.
 *
 * r  A single precision integer.
 * size  Number of words available in r.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
static void sp_4096_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int src = n - 1;
    int dst = 0;
    word32 shift = 0;

    r[0] = 0;
    while (src >= 0) {
        r[dst] |= (((sp_digit)a[src]) << shift);
        if (shift < 18U) {
            /* The whole byte fitted; the next byte goes 8 bits higher. */
            shift += 8U;
        }
        else {
            /* Word is complete: keep 26 bits and carry the remaining bits
             * of this byte into the next word. */
            r[dst] &= 0x3ffffff;
            shift = 26U - shift;
            if (dst + 1 >= size) {
                break;
            }
            dst++;
            r[dst] = (sp_digit)a[src] >> shift;
            shift = 8U - shift;
        }
        src--;
    }

    /* Clear every word above the last one written. */
    for (dst++; dst < size; dst++) {
        r[dst] = 0;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a are repacked into 26-bit words. At most size words are
 * written and any words of r above the value are set to zero.
 *
 * r  A single precision integer.
 * size  Number of words available in r.
 * a  A multi-precision integer.
 */
static void sp_4096_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 26
    /* Same digit width on both sides: copy digit for digit without branching
     * on a->used. j is an unsigned 32-bit count that starts at -used, so its
     * top bit is set exactly while digits of a remain. Shifting by anything
     * other than 31 does not give a 0/1 flag for a 32-bit counter, and a
     * signed counter would give -1 instead of 1. */
    int i;
    word32 j = (word32)0 - (word32)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        /* All ones while digits remain, zero afterwards. */
        sp_digit mask = (sp_digit)0 - (sp_digit)(j >> 31);
        r[i] = a->dp[o] & mask;
        j++;
        /* Step to the next digit only while another one remains, so o never
         * goes past the last used digit. */
        o += (int)(j >> 31);
    }
#elif DIGIT_BIT > 26
    /* mp digits are wider than 26 bits: each one is split over two or more
     * words of r. */
    unsigned int i;
    int j = 0;    /* Index of the word of r being filled. */
    word32 s = 0; /* Bit position in r[j] at which a->dp[i] is placed. */

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x3ffffff;
        /* s now counts the bits of this mp digit already consumed. */
        s = 26U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further whole words while the mp digit has 26 more bits. */
        while ((s + 26U) <= (word32)DIGIT_BIT) {
            s += 26U;
            r[j] &= 0x3ffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                /* mp digit fully consumed; avoids a shift by DIGIT_BIT. */
                r[++j] = (sp_digit)0;
            }
        }
        /* Bits of this mp digit sitting in r[j]; the next digit goes above. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Clear every word above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp digits are narrower than 26 bits: several are gathered into each
     * word of r. */
    unsigned int i;
    int j = 0; /* Index of the word of r being filled. */
    int s = 0; /* Bit position in r[j] at which a->dp[i] is placed. */

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 26) {
            /* Word is complete: keep 26 bits. */
            r[j] &= 0x3ffffff;
            if (j + 1 >= size) {
                break;
            }
            /* Bits of this mp digit that went into the completed word. */
            s = 26 - s;
            if (s == DIGIT_BIT) {
                /* Digit ended exactly on the word boundary. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the remaining bits into the next word. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Clear every word above the last one written. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 512
 *
 * r is normalized in place to 26 bits per word before it is written out.
 * Words 0 to 157 are serialized, least significant word into the last byte.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_4096_to_bin_162(sp_digit* r, byte* a)
{
    int i;
    int j;
    int s = 0;
    int b;

    /* Propagate carries so every word below the top one holds 26 bits. */
    for (i=0; i<161; i++) {
        r[i+1] += r[i] >> 26;
        r[i] &= 0x3ffffff;
    }
    /* Index of the last output byte: 4103 / 8 - 1 = 511. */
    j = 4103 / 8 - 1;
    a[j] = 0;
    for (i=0; i<158 && j>=0; i++) {
        /* b counts the bits of r[i] written so far. */
        b = 0;
        /* Complete the partly filled byte; s is the bit position in it at
         * which this word starts. */
        /* lint allow cast of mismatch sp_digit and int */
        a[j--] |= (byte)(r[i] << s); /*lint !e9033*/
        b += 8 - s;
        if (j < 0) {
            break;
        }
        /* Write the rest of the word one byte at a time. */
        while (b < 26) {
            a[j--] = (byte)(r[i] >> b);
            b += 8;
            if (j < 0) {
                break;
            }
        }
        /* b - 26 bits of the last byte written are unused, so the next word
         * starts at bit 8 - (b - 26) of that byte. */
        s = 8 - (b - 26);
        if (j >= 0) {
            a[j] = 0;
        }
        /* Go back to the last byte written so the next word is OR-ed in. */
        if (s != 0) {
            j++;
        }
    }
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Propagate carries so that each of the low 80 words holds 26 bits.
 *
 * The top word, a[80], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_81(sp_digit* a)
{
    sp_digit* w;

    /* Ten groups of eight words; each group passes its carry into w[8]. */
    for (w = a; w < a + 80; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
}

#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */
#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
/* Propagate carries so that each of the low 78 words holds 26 bits.
 *
 * The top word, a[78], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_79(sp_digit* a)
{
    sp_digit* w;
    int i;

    /* Nine groups of eight words; each group passes its carry into w[8]. */
    for (w = a; w < a + 72; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
    /* Remaining words 72 to 77. */
    for (i = 72; i < 78; i++) {
        a[i + 1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
}

/* Propagate carries so that each of the low 161 words holds 26 bits.
 *
 * The top word, a[161], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_162(sp_digit* a)
{
    sp_digit* w;

    /* Twenty groups of eight words; each passes its carry into w[8]. */
    for (w = a; w < a + 160; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
    /* Remaining word 160. */
    a[161] += a[160] >> 26;
    a[160] &= 0x3ffffff;
}

/* Propagate carries so that each of the low 157 words holds 26 bits.
 *
 * The top word, a[157], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_158(sp_digit* a)
{
    sp_digit* w;
    int i;

    /* Nineteen groups of eight words; each passes its carry into w[8]. */
    for (w = a; w < a + 152; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
    /* Remaining words 152 to 156. */
    for (i = 152; i < 157; i++) {
        a[i + 1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
}

#ifndef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled product of two 9 word numbers giving an 18 word result.
 * Each result word is the sum of the products a[i] * b[j] with the same
 * i + j, plus the carry from the word below, reduced to 26 bits. t0 and t1
 * alternate as the accumulator for the current and the next result word.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_4096_mul_9(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_uint64 t0;
    sp_uint64 t1;
    /* Low nine result words are held here and copied into r at the end,
     * presumably so that r may be the same array as a or b - confirm against
     * the callers. */
    sp_digit t[9];

    /* Result words 0 to 8: the number of products grows by one per word. */
    t0 = ((sp_uint64)a[ 0]) * b[ 0];
    t1 = ((sp_uint64)a[ 0]) * b[ 1]
       + ((sp_uint64)a[ 1]) * b[ 0];
    t[ 0] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 0]) * b[ 2]
       + ((sp_uint64)a[ 1]) * b[ 1]
       + ((sp_uint64)a[ 2]) * b[ 0];
    t[ 1] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 0]) * b[ 3]
       + ((sp_uint64)a[ 1]) * b[ 2]
       + ((sp_uint64)a[ 2]) * b[ 1]
       + ((sp_uint64)a[ 3]) * b[ 0];
    t[ 2] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 0]) * b[ 4]
       + ((sp_uint64)a[ 1]) * b[ 3]
       + ((sp_uint64)a[ 2]) * b[ 2]
       + ((sp_uint64)a[ 3]) * b[ 1]
       + ((sp_uint64)a[ 4]) * b[ 0];
    t[ 3] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 0]) * b[ 5]
       + ((sp_uint64)a[ 1]) * b[ 4]
       + ((sp_uint64)a[ 2]) * b[ 3]
       + ((sp_uint64)a[ 3]) * b[ 2]
       + ((sp_uint64)a[ 4]) * b[ 1]
       + ((sp_uint64)a[ 5]) * b[ 0];
    t[ 4] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 0]) * b[ 6]
       + ((sp_uint64)a[ 1]) * b[ 5]
       + ((sp_uint64)a[ 2]) * b[ 4]
       + ((sp_uint64)a[ 3]) * b[ 3]
       + ((sp_uint64)a[ 4]) * b[ 2]
       + ((sp_uint64)a[ 5]) * b[ 1]
       + ((sp_uint64)a[ 6]) * b[ 0];
    t[ 5] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 0]) * b[ 7]
       + ((sp_uint64)a[ 1]) * b[ 6]
       + ((sp_uint64)a[ 2]) * b[ 5]
       + ((sp_uint64)a[ 3]) * b[ 4]
       + ((sp_uint64)a[ 4]) * b[ 3]
       + ((sp_uint64)a[ 5]) * b[ 2]
       + ((sp_uint64)a[ 6]) * b[ 1]
       + ((sp_uint64)a[ 7]) * b[ 0];
    t[ 6] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 0]) * b[ 8]
       + ((sp_uint64)a[ 1]) * b[ 7]
       + ((sp_uint64)a[ 2]) * b[ 6]
       + ((sp_uint64)a[ 3]) * b[ 5]
       + ((sp_uint64)a[ 4]) * b[ 4]
       + ((sp_uint64)a[ 5]) * b[ 3]
       + ((sp_uint64)a[ 6]) * b[ 2]
       + ((sp_uint64)a[ 7]) * b[ 1]
       + ((sp_uint64)a[ 8]) * b[ 0];
    t[ 7] = t1 & 0x3ffffff; t0 += t1 >> 26;
    /* Result words 9 to 16: the number of products shrinks by one per word.
     * These words are stored straight into r. */
    t1 = ((sp_uint64)a[ 1]) * b[ 8]
       + ((sp_uint64)a[ 2]) * b[ 7]
       + ((sp_uint64)a[ 3]) * b[ 6]
       + ((sp_uint64)a[ 4]) * b[ 5]
       + ((sp_uint64)a[ 5]) * b[ 4]
       + ((sp_uint64)a[ 6]) * b[ 3]
       + ((sp_uint64)a[ 7]) * b[ 2]
       + ((sp_uint64)a[ 8]) * b[ 1];
    t[ 8] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 2]) * b[ 8]
       + ((sp_uint64)a[ 3]) * b[ 7]
       + ((sp_uint64)a[ 4]) * b[ 6]
       + ((sp_uint64)a[ 5]) * b[ 5]
       + ((sp_uint64)a[ 6]) * b[ 4]
       + ((sp_uint64)a[ 7]) * b[ 3]
       + ((sp_uint64)a[ 8]) * b[ 2];
    r[ 9] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 3]) * b[ 8]
       + ((sp_uint64)a[ 4]) * b[ 7]
       + ((sp_uint64)a[ 5]) * b[ 6]
       + ((sp_uint64)a[ 6]) * b[ 5]
       + ((sp_uint64)a[ 7]) * b[ 4]
       + ((sp_uint64)a[ 8]) * b[ 3];
    r[10] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 4]) * b[ 8]
       + ((sp_uint64)a[ 5]) * b[ 7]
       + ((sp_uint64)a[ 6]) * b[ 6]
       + ((sp_uint64)a[ 7]) * b[ 5]
       + ((sp_uint64)a[ 8]) * b[ 4];
    r[11] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 5]) * b[ 8]
       + ((sp_uint64)a[ 6]) * b[ 7]
       + ((sp_uint64)a[ 7]) * b[ 6]
       + ((sp_uint64)a[ 8]) * b[ 5];
    r[12] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 6]) * b[ 8]
       + ((sp_uint64)a[ 7]) * b[ 7]
       + ((sp_uint64)a[ 8]) * b[ 6];
    r[13] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_uint64)a[ 7]) * b[ 8]
       + ((sp_uint64)a[ 8]) * b[ 7];
    r[14] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_uint64)a[ 8]) * b[ 8];
    r[15] = t1 & 0x3ffffff; t0 += t1 >> 26;
    r[16] = t0 & 0x3ffffff;
    /* The top word takes whatever is left above 26 bits. */
    r[17] = (sp_digit)(t0 >> 26);
    /* All reads of a and b are done; the low words can now go into r. */
    XMEMCPY(r, t, sizeof(t));
}

/* Word-wise sum of two 9 word numbers. (r = a + b)
 *
 * Each word is added on its own; no carry is passed between words.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 9; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Word-wise difference of two 18 word numbers. (r = a - b)
 *
 * Each word is subtracted on its own; no borrow is passed between words.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 18; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Word-wise sum of two 18 word numbers. (r = a + b)
 *
 * Each word is added on its own; no carry is passed between words.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_18(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 18; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Propagate carries so that each of the low 8 words holds 26 bits.
 *
 * The top word, a[8], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_9(sp_digit* a)
{
    int i;

    for (i = 0; i < 8; i++) {
        a[i + 1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
}

/* Propagate carries so that each of the low 17 words holds 26 bits.
 *
 * The top word, a[17], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_18(sp_digit* a)
{
    sp_digit* w;

    /* Two groups of eight words; each group passes its carry into w[8]. */
    for (w = a; w < a + 16; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
    /* Remaining word 16. */
    a[17] += a[16] >> 26;
    a[16] &= 0x3ffffff;
}

/* Propagate carries so that each of the low 53 words holds 26 bits.
 *
 * The top word, a[53], receives the final carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_4096_norm_54(sp_digit* a)
{
    sp_digit* w;
    int i;

    /* Six groups of eight words; each group passes its carry into w[8]. */
    for (w = a; w < a + 48; w += 8) {
        w[1] += w[0] >> 26; w[0] &= 0x3ffffff;
        w[2] += w[1] >> 26; w[1] &= 0x3ffffff;
        w[3] += w[2] >> 26; w[2] &= 0x3ffffff;
        w[4] += w[3] >> 26; w[3] &= 0x3ffffff;
        w[5] += w[4] >> 26; w[4] &= 0x3ffffff;
        w[6] += w[5] >> 26; w[5] &= 0x3ffffff;
        w[7] += w[6] >> 26; w[6] &= 0x3ffffff;
        w[8] += w[7] >> 26; w[7] &= 0x3ffffff;
    }
    /* Remaining words 48 to 52. */
    for (i = 48; i < 53; i++) {
        a[i + 1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * Each 27-digit operand is treated as three 9-digit parts (low, middle,
 * high). The product is assembled from six 9-digit multiplications: the
 * three part-by-part products plus the products of the part sums
 * (low+middle, middle+high and low+middle+high).
 *
 * All reads of a and b happen before r is cleared and written.
 *
 * r  Result of the multiplication. Must have room for 54 digits.
 * a  First operand (27 digits).
 * b  Second operand (27 digits).
 */
SP_NOINLINE static void sp_4096_mul_27(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit p0[18];
    sp_digit p1[18];
    sp_digit p2[18];
    sp_digit p3[18];
    sp_digit p4[18];
    sp_digit p5[18];
    sp_digit t0[18];
    sp_digit t1[18];
    sp_digit t2[18];
    sp_digit a0[9];
    sp_digit a1[9];
    sp_digit a2[9];
    sp_digit b0[9];
    sp_digit b1[9];
    sp_digit b2[9];

    /* a0/b0 = low + middle part. */
    (void)sp_4096_add_9(a0, a, &a[9]);
    sp_4096_norm_9(a0);
    (void)sp_4096_add_9(b0, b, &b[9]);
    sp_4096_norm_9(b0);
    /* a1/b1 = middle + high part. */
    (void)sp_4096_add_9(a1, &a[9], &a[18]);
    sp_4096_norm_9(a1);
    (void)sp_4096_add_9(b1, &b[9], &b[18]);
    sp_4096_norm_9(b1);
    /* a2/b2 = low + middle + high part. Normalize a2 itself (not a1 again)
     * so that it is handled the same way as b2 and as in sp_4096_sqr_27. */
    (void)sp_4096_add_9(a2, a0, &a[18]);
    sp_4096_norm_9(a2);
    (void)sp_4096_add_9(b2, b0, &b[18]);
    sp_4096_norm_9(b2);

    /* Part products: p0 = low*low, p2 = middle*middle, p4 = high*high. */
    sp_4096_mul_9(p0, a, b);
    sp_4096_mul_9(p2, &a[9], &b[9]);
    sp_4096_mul_9(p4, &a[18], &b[18]);
    /* Products of the part sums. */
    sp_4096_mul_9(p1, a0, b0);
    sp_4096_mul_9(p3, a1, b1);
    sp_4096_mul_9(p5, a2, b2);

    XMEMSET(r, 0, sizeof(*r)*2U*27U);
    /* t2 = p5 - (p3 - p2) - (p1 - p2): term placed at digit offset 18. */
    (void)sp_4096_sub_18(t0, p3, p2);
    (void)sp_4096_sub_18(t1, p1, p2);
    (void)sp_4096_sub_18(t2, p5, t0);
    (void)sp_4096_sub_18(t2, t2, t1);
    sp_4096_norm_18(t2);
    /* t0 = p3 - p2 - p4: term placed at digit offset 27. */
    (void)sp_4096_sub_18(t0, t0, p4);
    sp_4096_norm_18(t0);
    /* t1 = p1 - p2 - p0: term placed at digit offset 9. */
    (void)sp_4096_sub_18(t1, t1, p0);
    sp_4096_norm_18(t1);

    /* Accumulate the five terms at 9-digit offsets and propagate carries. */
    (void)sp_4096_add_18(r, r, p0);
    (void)sp_4096_add_18(&r[9], &r[9], t1);
    (void)sp_4096_add_18(&r[18], &r[18], t2);
    (void)sp_4096_add_18(&r[27], &r[27], t0);
    (void)sp_4096_add_18(&r[36], &r[36], p4);
    sp_4096_norm_54(r);
}

/* Add two 27-digit numbers digit by digit. (r = a + b)
 *
 * Each output digit is the plain sum of the matching input digits; no carry
 * is moved between digits here.
 *
 * r  Result of the addition (27 digits).
 * a  First operand (27 digits).
 * b  Second operand (27 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_27(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 27; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Subtract two 54-digit numbers digit by digit. (r = a - b)
 *
 * Each output digit is the plain difference of the matching input digits;
 * no borrow is moved between digits here.
 *
 * r  Result of the subtraction (54 digits).
 * a  Number to subtract from (54 digits).
 * b  Number to subtract (54 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 54; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Add two 54-digit numbers digit by digit. (r = a + b)
 *
 * Each output digit is the plain sum of the matching input digits; no carry
 * is moved between digits here.
 *
 * r  Result of the addition (54 digits).
 * a  First operand (54 digits).
 * b  Second operand (54 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_54(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 54; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Normalize the values in each word to 26 bits.
 *
 * Walks from digit 0 upwards, moving everything above bit 25 of a digit into
 * the next digit. Digits 0..25 end up masked to 26 bits; digit 26 receives
 * the last carry and is not masked.
 *
 * a  Array of 27 sp_digit to normalize in place.
 */
static void sp_4096_norm_27(sp_digit* a)
{
    int k;

    for (k = 0; k < 26; k++) {
        a[k + 1] += a[k] >> 26;
        a[k] &= 0x3ffffff;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * Each 81-digit operand is treated as three 27-digit parts (low, middle,
 * high). The product is assembled from six 27-digit multiplications: the
 * three part-by-part products plus the products of the part sums
 * (low+middle, middle+high and low+middle+high).
 *
 * All reads of a and b happen before r is cleared and written.
 *
 * r  Result of the multiplication. Must have room for 162 digits.
 * a  First operand (81 digits).
 * b  Second operand (81 digits).
 */
SP_NOINLINE static void sp_4096_mul_81(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit p0[54];
    sp_digit p1[54];
    sp_digit p2[54];
    sp_digit p3[54];
    sp_digit p4[54];
    sp_digit p5[54];
    sp_digit t0[54];
    sp_digit t1[54];
    sp_digit t2[54];
    sp_digit a0[27];
    sp_digit a1[27];
    sp_digit a2[27];
    sp_digit b0[27];
    sp_digit b1[27];
    sp_digit b2[27];

    /* a0/b0 = low + middle part. */
    (void)sp_4096_add_27(a0, a, &a[27]);
    sp_4096_norm_27(a0);
    (void)sp_4096_add_27(b0, b, &b[27]);
    sp_4096_norm_27(b0);
    /* a1/b1 = middle + high part. */
    (void)sp_4096_add_27(a1, &a[27], &a[54]);
    sp_4096_norm_27(a1);
    (void)sp_4096_add_27(b1, &b[27], &b[54]);
    sp_4096_norm_27(b1);
    /* a2/b2 = low + middle + high part. Normalize a2 itself (not a1 again)
     * so that it is handled the same way as b2 and as in sp_4096_sqr_81. */
    (void)sp_4096_add_27(a2, a0, &a[54]);
    sp_4096_norm_27(a2);
    (void)sp_4096_add_27(b2, b0, &b[54]);
    sp_4096_norm_27(b2);

    /* Part products: p0 = low*low, p2 = middle*middle, p4 = high*high. */
    sp_4096_mul_27(p0, a, b);
    sp_4096_mul_27(p2, &a[27], &b[27]);
    sp_4096_mul_27(p4, &a[54], &b[54]);
    /* Products of the part sums. */
    sp_4096_mul_27(p1, a0, b0);
    sp_4096_mul_27(p3, a1, b1);
    sp_4096_mul_27(p5, a2, b2);

    XMEMSET(r, 0, sizeof(*r)*2U*81U);
    /* t2 = p5 - (p3 - p2) - (p1 - p2): term placed at digit offset 54. */
    (void)sp_4096_sub_54(t0, p3, p2);
    (void)sp_4096_sub_54(t1, p1, p2);
    (void)sp_4096_sub_54(t2, p5, t0);
    (void)sp_4096_sub_54(t2, t2, t1);
    sp_4096_norm_54(t2);
    /* t0 = p3 - p2 - p4: term placed at digit offset 81. */
    (void)sp_4096_sub_54(t0, t0, p4);
    sp_4096_norm_54(t0);
    /* t1 = p1 - p2 - p0: term placed at digit offset 27. */
    (void)sp_4096_sub_54(t1, t1, p0);
    sp_4096_norm_54(t1);

    /* Accumulate the five terms at 27-digit offsets and propagate carries. */
    (void)sp_4096_add_54(r, r, p0);
    (void)sp_4096_add_54(&r[27], &r[27], t1);
    (void)sp_4096_add_54(&r[54], &r[54], t2);
    (void)sp_4096_add_54(&r[81], &r[81], t0);
    (void)sp_4096_add_54(&r[108], &r[108], p4);
    sp_4096_norm_162(r);
}

/* Add two 81-digit numbers digit by digit. (r = a + b)
 *
 * Each output digit is the plain sum of the matching input digits; no carry
 * is moved between digits here.
 *
 * r  Result of the addition (81 digits).
 * a  First operand (81 digits).
 * b  Second operand (81 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_81(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 81; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Add two 162-digit numbers digit by digit. (r = a + b)
 *
 * Each output digit is the plain sum of the matching input digits; no carry
 * is moved between digits here.
 *
 * r  Result of the addition (162 digits).
 * a  First operand (162 digits).
 * b  Second operand (162 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_add_162(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 162; k++) {
        r[k] = a[k] + b[k];
    }

    return 0;
}

/* Subtract two 162-digit numbers digit by digit. (r = a - b)
 *
 * Each output digit is the plain difference of the matching input digits;
 * no borrow is moved between digits here.
 *
 * r  Result of the subtraction (162 digits).
 * a  Number to subtract from (162 digits).
 * b  Number to subtract (162 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_162(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 162; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* Normalize the values in each word to 26 bits.
 *
 * Walks from digit 0 upwards, moving everything above bit 25 of a digit into
 * the next digit. Digits 0..322 end up masked to 26 bits; digit 323 receives
 * the last carry and is not masked.
 *
 * a  Array of 324 sp_digit to normalize in place.
 */
static void sp_4096_norm_324(sp_digit* a)
{
    int k;

    for (k = 0; k < 323; k++) {
        a[k + 1] += a[k] >> 26;
        a[k] &= 0x3ffffff;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * Splits each 162-digit operand into two 81-digit halves and uses three
 * 81-digit multiplications: low*low, high*high and (low+high)*(low+high
 * of b). The cross term is recovered by subtracting the first two products
 * from the third and is added in at digit offset 81.
 *
 * r  Result of the multiplication. Must have room for 324 digits.
 * a  First operand (162 digits).
 * b  Second operand (162 digits).
 */
SP_NOINLINE static void sp_4096_mul_162(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit mid[162];
    sp_digit bsum[81];
    /* The sum of a's halves lives in the start of mid until mid is
     * overwritten with the product of the sums. */
    sp_digit* asum = mid;
    sp_digit* low = r;
    sp_digit* high = r + 162;

    (void)sp_4096_add_81(asum, a, a + 81);
    sp_4096_norm_81(asum);
    (void)sp_4096_add_81(bsum, b, b + 81);
    sp_4096_norm_81(bsum);

    sp_4096_mul_81(high, a + 81, b + 81);
    sp_4096_mul_81(low, a, b);
    sp_4096_mul_81(mid, asum, bsum);

    /* mid = (sum product) - high - low, i.e. the cross term. */
    (void)sp_4096_sub_162(mid, mid, high);
    (void)sp_4096_sub_162(mid, mid, low);
    (void)sp_4096_add_162(r + 81, r + 81, mid);
    sp_4096_norm_324(r);
}

/* Square a and put result in r. (r = a * a)
 *
 * Computes the 18-digit square of a 9-digit number column by column.
 * t0 and t1 alternate as the accumulator for the current column: one holds
 * the column being finished while the other is loaded with the next
 * column's products and then receives the carry (>> 26) of the finished one.
 * Off-diagonal products a[i]*a[j] (i != j) are summed once and doubled;
 * diagonal products a[i]*a[i] are added once.
 *
 * The low 9 result digits are staged in the local array t and copied to r
 * only at the end, after the last read of a.
 *
 * r  A single precision integer. Receives 18 digits.
 * a  A single precision integer. 9 digits are read.
 */
SP_NOINLINE static void sp_4096_sqr_9(sp_digit* r, const sp_digit* a)
{
    sp_uint64 t0;
    sp_uint64 t1;
    sp_digit t[9];

    t0 =  ((sp_uint64)a[ 0]) * a[ 0];
    t1 = (((sp_uint64)a[ 0]) * a[ 1]) * 2;
    t[ 0] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 0]) * a[ 2]) * 2
       +  ((sp_uint64)a[ 1]) * a[ 1];
    t[ 1] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 0]) * a[ 3]
       +  ((sp_uint64)a[ 1]) * a[ 2]) * 2;
    t[ 2] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 0]) * a[ 4]
       +  ((sp_uint64)a[ 1]) * a[ 3]) * 2
       +  ((sp_uint64)a[ 2]) * a[ 2];
    t[ 3] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 0]) * a[ 5]
       +  ((sp_uint64)a[ 1]) * a[ 4]
       +  ((sp_uint64)a[ 2]) * a[ 3]) * 2;
    t[ 4] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 0]) * a[ 6]
       +  ((sp_uint64)a[ 1]) * a[ 5]
       +  ((sp_uint64)a[ 2]) * a[ 4]) * 2
       +  ((sp_uint64)a[ 3]) * a[ 3];
    t[ 5] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 0]) * a[ 7]
       +  ((sp_uint64)a[ 1]) * a[ 6]
       +  ((sp_uint64)a[ 2]) * a[ 5]
       +  ((sp_uint64)a[ 3]) * a[ 4]) * 2;
    t[ 6] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 0]) * a[ 8]
       +  ((sp_uint64)a[ 1]) * a[ 7]
       +  ((sp_uint64)a[ 2]) * a[ 6]
       +  ((sp_uint64)a[ 3]) * a[ 5]) * 2
       +  ((sp_uint64)a[ 4]) * a[ 4];
    t[ 7] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 1]) * a[ 8]
       +  ((sp_uint64)a[ 2]) * a[ 7]
       +  ((sp_uint64)a[ 3]) * a[ 6]
       +  ((sp_uint64)a[ 4]) * a[ 5]) * 2;
    t[ 8] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 2]) * a[ 8]
       +  ((sp_uint64)a[ 3]) * a[ 7]
       +  ((sp_uint64)a[ 4]) * a[ 6]) * 2
       +  ((sp_uint64)a[ 5]) * a[ 5];
    /* From column 9 on the digits are written straight to r: indices 9..17
     * lie beyond the 9 input digits that are read through a. */
    r[ 9] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 3]) * a[ 8]
       +  ((sp_uint64)a[ 4]) * a[ 7]
       +  ((sp_uint64)a[ 5]) * a[ 6]) * 2;
    r[10] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 4]) * a[ 8]
       +  ((sp_uint64)a[ 5]) * a[ 7]) * 2
       +  ((sp_uint64)a[ 6]) * a[ 6];
    r[11] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 5]) * a[ 8]
       +  ((sp_uint64)a[ 6]) * a[ 7]) * 2;
    r[12] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_uint64)a[ 6]) * a[ 8]) * 2
       +  ((sp_uint64)a[ 7]) * a[ 7];
    r[13] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_uint64)a[ 7]) * a[ 8]) * 2;
    r[14] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 =  ((sp_uint64)a[ 8]) * a[ 8];
    r[15] = t1 & 0x3ffffff; t0 += t1 >> 26;
    r[16] = t0 & 0x3ffffff;
    /* The top digit takes whatever is left and is not masked. */
    r[17] = (sp_digit)(t0 >> 26);
    /* Publish the staged low half now that a is no longer needed. */
    XMEMCPY(r, t, sizeof(t));
}

/* Square a into r. (r = a * a)
 *
 * The 27-digit input is treated as three 9-digit parts (low, middle, high).
 * The square is assembled from six 9-digit squarings: the three parts and
 * the part sums low+middle, middle+high and low+middle+high.
 *
 * All reads of a happen before r is cleared and written.
 *
 * r  Result of the squaring. Must have room for 54 digits.
 * a  Number to square (27 digits).
 */
SP_NOINLINE static void sp_4096_sqr_27(sp_digit* r, const sp_digit* a)
{
    sp_digit s01[9];     /* low + middle */
    sp_digit s12[9];     /* middle + high */
    sp_digit s012[9];    /* low + middle + high */
    sp_digit sq0[18];    /* low^2 */
    sp_digit sq1[18];    /* middle^2 */
    sp_digit sq2[18];    /* high^2 */
    sp_digit sq01[18];   /* (low + middle)^2 */
    sp_digit sq12[18];   /* (middle + high)^2 */
    sp_digit sq012[18];  /* (low + middle + high)^2 */
    sp_digit c1[18];     /* term for digit offset 9 */
    sp_digit c2[18];     /* term for digit offset 18 */
    sp_digit c3[18];     /* term for digit offset 27 */

    (void)sp_4096_add_9(s01, a, a + 9);
    sp_4096_norm_9(s01);
    (void)sp_4096_add_9(s12, a + 9, a + 18);
    sp_4096_norm_9(s12);
    (void)sp_4096_add_9(s012, s01, a + 18);
    sp_4096_norm_9(s012);

    sp_4096_sqr_9(sq0, a);
    sp_4096_sqr_9(sq1, a + 9);
    sp_4096_sqr_9(sq2, a + 18);
    sp_4096_sqr_9(sq01, s01);
    sp_4096_sqr_9(sq12, s12);
    sp_4096_sqr_9(sq012, s012);

    XMEMSET(r, 0, sizeof(*r)*2U*27U);

    (void)sp_4096_sub_18(c3, sq12, sq1);
    (void)sp_4096_sub_18(c1, sq01, sq1);
    /* c2 = sq012 - (sq12 - sq1) - (sq01 - sq1) */
    (void)sp_4096_sub_18(c2, sq012, c3);
    (void)sp_4096_sub_18(c2, c2, c1);
    sp_4096_norm_18(c2);
    /* c3 = sq12 - sq1 - sq2 */
    (void)sp_4096_sub_18(c3, c3, sq2);
    sp_4096_norm_18(c3);
    /* c1 = sq01 - sq1 - sq0 */
    (void)sp_4096_sub_18(c1, c1, sq0);
    sp_4096_norm_18(c1);

    /* Accumulate the five terms at 9-digit offsets and propagate carries. */
    (void)sp_4096_add_18(r, r, sq0);
    (void)sp_4096_add_18(r + 9, r + 9, c1);
    (void)sp_4096_add_18(r + 18, r + 18, c2);
    (void)sp_4096_add_18(r + 27, r + 27, c3);
    (void)sp_4096_add_18(r + 36, r + 36, sq2);
    sp_4096_norm_54(r);
}

/* Square a into r. (r = a * a)
 *
 * The 81-digit input is treated as three 27-digit parts (low, middle, high).
 * The square is assembled from six 27-digit squarings: the three parts and
 * the part sums low+middle, middle+high and low+middle+high.
 *
 * All reads of a happen before r is cleared and written.
 *
 * r  Result of the squaring. Must have room for 162 digits.
 * a  Number to square (81 digits).
 */
SP_NOINLINE static void sp_4096_sqr_81(sp_digit* r, const sp_digit* a)
{
    sp_digit s01[27];    /* low + middle */
    sp_digit s12[27];    /* middle + high */
    sp_digit s012[27];   /* low + middle + high */
    sp_digit sq0[54];    /* low^2 */
    sp_digit sq1[54];    /* middle^2 */
    sp_digit sq2[54];    /* high^2 */
    sp_digit sq01[54];   /* (low + middle)^2 */
    sp_digit sq12[54];   /* (middle + high)^2 */
    sp_digit sq012[54];  /* (low + middle + high)^2 */
    sp_digit c1[54];     /* term for digit offset 27 */
    sp_digit c2[54];     /* term for digit offset 54 */
    sp_digit c3[54];     /* term for digit offset 81 */

    (void)sp_4096_add_27(s01, a, a + 27);
    sp_4096_norm_27(s01);
    (void)sp_4096_add_27(s12, a + 27, a + 54);
    sp_4096_norm_27(s12);
    (void)sp_4096_add_27(s012, s01, a + 54);
    sp_4096_norm_27(s012);

    sp_4096_sqr_27(sq0, a);
    sp_4096_sqr_27(sq1, a + 27);
    sp_4096_sqr_27(sq2, a + 54);
    sp_4096_sqr_27(sq01, s01);
    sp_4096_sqr_27(sq12, s12);
    sp_4096_sqr_27(sq012, s012);

    XMEMSET(r, 0, sizeof(*r)*2U*81U);

    (void)sp_4096_sub_54(c3, sq12, sq1);
    (void)sp_4096_sub_54(c1, sq01, sq1);
    /* c2 = sq012 - (sq12 - sq1) - (sq01 - sq1) */
    (void)sp_4096_sub_54(c2, sq012, c3);
    (void)sp_4096_sub_54(c2, c2, c1);
    sp_4096_norm_54(c2);
    /* c3 = sq12 - sq1 - sq2 */
    (void)sp_4096_sub_54(c3, c3, sq2);
    sp_4096_norm_54(c3);
    /* c1 = sq01 - sq1 - sq0 */
    (void)sp_4096_sub_54(c1, c1, sq0);
    sp_4096_norm_54(c1);

    /* Accumulate the five terms at 27-digit offsets and propagate carries. */
    (void)sp_4096_add_54(r, r, sq0);
    (void)sp_4096_add_54(r + 27, r + 27, c1);
    (void)sp_4096_add_54(r + 54, r + 54, c2);
    (void)sp_4096_add_54(r + 81, r + 81, c3);
    (void)sp_4096_add_54(r + 108, r + 108, sq2);
    sp_4096_norm_162(r);
}

/* Square a and put result in r. (r = a * a)
 *
 * Splits the 162-digit input into two 81-digit halves and uses three
 * 81-digit squarings: low^2, high^2 and (low+high)^2. The cross term is
 * recovered by subtracting the first two from the third and is added in at
 * digit offset 81.
 *
 * r  Result of the squaring. Must have room for 324 digits.
 * a  Number to square (162 digits).
 */
SP_NOINLINE static void sp_4096_sqr_162(sp_digit* r, const sp_digit* a)
{
    sp_digit mid[162];
    /* The sum of the halves lives in the start of mid until mid is
     * overwritten with its square. */
    sp_digit* asum = mid;
    sp_digit* low = r;
    sp_digit* high = r + 162;

    (void)sp_4096_add_81(asum, a, a + 81);
    sp_4096_norm_81(asum);

    sp_4096_sqr_81(high, a + 81);
    sp_4096_sqr_81(low, a);
    sp_4096_sqr_81(mid, asum);

    /* mid = (low+high)^2 - high^2 - low^2, i.e. the cross term. */
    (void)sp_4096_sub_162(mid, mid, high);
    (void)sp_4096_sub_162(mid, mid, low);
    (void)sp_4096_add_162(r + 81, r + 81, mid);
    sp_4096_norm_324(r);
}

#endif /* !WOLFSSL_SP_SMALL */
/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest digit a[0] is read. An initial value x that inverts a[0]
 * modulo 2^4 is refined by three steps of x *= 2 - b * x, each of which
 * doubles the number of correct low bits (see the per-line notes). The
 * result is masked to 26 bits and negated modulo 2^26.
 *
 * NOTE(review): the refinement relies on the products wrapping modulo the
 * width of sp_digit; a[0] is presumably odd (an inverse modulo a power of
 * two only exists for odd values) - confirm with callers.
 *
 * a    A single precision number.
 * rho  Bottom word of inverse. Receives 2^26 - x.
 */
static void sp_4096_mont_setup(const sp_digit* a, sp_digit* rho)
{
    sp_digit x;
    sp_digit b;

    b = a[0];
    x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
    x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
    /* Keep only the 26 bits that make up one digit. */
    x &= 0x3ffffff;

    /* rho = -1/m mod b */
    *rho = ((sp_digit)1 << 26) - x;
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * A running 64-bit accumulator carries the overflow of each digit product
 * into the next digit. Every output digit is masked to 26 bits.
 *
 * r  Result of the multiplication. Receives 163 digits (r[0..162]).
 * a  Number to multiply (162 digits).
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_d_162(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int k;

    for (k = 0; k < 162; k++) {
        acc += scalar * a[k];
        r[k] = (sp_digit)(acc & 0x3ffffff);
        acc >>= 26;
    }
    /* Whatever carry is left becomes the extra top digit. */
    r[162] = (sp_digit)(acc & 0x3ffffff);
}

#if (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
#if defined(WOLFSSL_HAVE_SP_RSA) && !defined(SP_RSA_PRIVATE_EXP_D)
/* Subtract two 81-digit numbers digit by digit. (r = a - b)
 *
 * Each output digit is the plain difference of the matching input digits;
 * no borrow is moved between digits here.
 *
 * r  Result of the subtraction (81 digits).
 * a  Number to subtract from (81 digits).
 * b  Number to subtract (81 digits).
 * returns 0 always.
 */
SP_NOINLINE static int sp_4096_sub_81(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int k;

    for (k = 0; k < 81; k++) {
        r[k] = a[k] - b[k];
    }

    return 0;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * The value built here is 78 full 26-bit digits plus one 20-bit digit, that
 * is 2^2048 - 1. m is subtracted once digit by digit (no borrow
 * propagation is done here) and one is added back, so m is presumably
 * expected to have its top bit set at bit 2047 - confirm with callers.
 *
 * r  A single precision number. Receives 81 digits.
 * m  A single precision number (81 digits).
 */
static void sp_4096_mont_norm_81(sp_digit* r, const sp_digit* m)
{
    int k;

    /* Set r = 2^n - 1: digits 0..77 all ones, digit 78 holds 20 one bits. */
    for (k = 0; k < 78; k++) {
        r[k] = 0x3ffffff;
    }
    r[78] = 0xfffffL;
    r[79] = 0;
    r[80] = 0;

    /* r = (2^n - 1) - m */
    (void)sp_4096_sub_81(r, r, m);

    /* Add one so r = 2^n - m */
    r[0] += 1;
}

/* Compare a with b in constant time.
 *
 * Digits are visited from the most significant (80) down to 0. Every digit
 * is always processed; there is no data-dependent branch or early exit.
 * The mask applied to each difference is derived from the result gathered
 * so far and is intended to stop lower digits from changing the outcome
 * once a higher digit has differed.
 *
 * a  A single precision integer (81 digits).
 * b  A single precision integer (81 digits).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_4096_cmp_81(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

    /* Top digit: nothing recorded yet, so the mask is all ones. */
    diff |= (a[80] - b[80]) & (0 - (sp_digit)1);
    for (k = 79; k >= 0; k--) {
        diff |= (a[k] - b[k]) & ~(((sp_digit)0 - diff) >> 25);
    }

    return diff;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Each digit of b is ANDed with the mask before being subtracted, so the
 * same operations run whether or not the subtraction takes effect. No
 * borrow is moved between digits here.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_4096_cond_sub_81(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

    for (k = 0; k < 81; k++) {
        r[k] = a[k] - (b[k] & m);
    }
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Digits r[0..80] are rewritten masked to 26 bits. The carry left after
 * digit 80 is added into r[81], so r must have at least 82 digits.
 *
 * Two variants are compiled: a plain per-digit loop by default and an
 * 8-way unrolled form when WOLFSSL_SP_LARGE_CODE is defined.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_add_81(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 tb = b;
    sp_int64 t = 0;
    int i;

    /* t carries the overflow of each digit into the next one. */
    for (i = 0; i < 81; i++) {
        t += r[i];
        t += tb * a[i];
        r[i] = ((sp_digit)t) & 0x3ffffff;
        t >>= 26;
    }
    /* Final carry is added to (not stored over) the next digit. */
    r[81] += (sp_digit)t;
#else
    sp_int64 tb = b;
    sp_int64 t[8];
    int i;

    /* t[0] enters each group of eight holding the carry from the previous
     * group; t[1..7] are computed fresh and then chained by carries. */
    t[0] = 0;
    for (i = 0; i < 80; i += 8) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        t[4]  = (tb * a[i+4]) + r[i+4];
        t[5]  = (tb * a[i+5]) + r[i+5];
        t[6]  = (tb * a[i+6]) + r[i+6];
        t[7]  = (tb * a[i+7]) + r[i+7];
        r[i+0] = t[0] & 0x3ffffff;
        t[1] += t[0] >> 26;
        r[i+1] = t[1] & 0x3ffffff;
        t[2] += t[1] >> 26;
        r[i+2] = t[2] & 0x3ffffff;
        t[3] += t[2] >> 26;
        r[i+3] = t[3] & 0x3ffffff;
        t[4] += t[3] >> 26;
        r[i+4] = t[4] & 0x3ffffff;
        t[5] += t[4] >> 26;
        r[i+5] = t[5] & 0x3ffffff;
        t[6] += t[5] >> 26;
        r[i+6] = t[6] & 0x3ffffff;
        t[7] += t[6] >> 26;
        r[i+7] = t[7] & 0x3ffffff;
        t[0]  = t[7] >> 26;
    }
    /* Digit 80 is the one left over after ten groups of eight. */
    t[0] += (tb * a[80]) + r[80];
    r[80] = t[0] & 0x3ffffff;
    r[81] +=  (sp_digit)(t[0] >> 26);
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Shift the result in the high 2048 bits down to the bottom.
 *
 * The value starting at bit 20 of digit 78 is repacked into 26-bit digits
 * r[0..78]: the top 6 bits of a[78] seed the accumulator and each following
 * input digit is merged in 6 bits higher. r[79..157] are then cleared.
 * Output digit k is written before input digit k + 80 is read, so the
 * function is called with r == a.
 *
 * r  A single precision number. Receives 158 digits.
 * a  A single precision number. Digits 78..157 are read.
 */
static void sp_4096_mont_shift_81(sp_digit* r, const sp_digit* a)
{
    int k;
    sp_int64 w = a[78] >> 20;

    w += ((sp_int64)a[79]) << 6;
    for (k = 0; k < 78; k++) {
        r[k] = w & 0x3ffffff;
        w >>= 26;
        w += ((sp_int64)a[k + 80]) << 6;
    }
    /* The top digit takes whatever is left and is not masked. */
    r[78] = (sp_digit)w;
    XMEMSET(&r[79], 0, sizeof(*r) * 79U);
}

/* Reduce the number in place using Montgomery reduction.
 *
 * For each of digits 0..77 a multiplier mu is formed from the digit and mp
 * (masked to 26 bits), mu * m is added in at that digit's offset, and the
 * digit's overflow is carried into the next digit. Digit 78 is handled the
 * same way but with a 20-bit mask; 78 * 26 + 20 = 2048, so this variant
 * appears to reduce by 2048 bits. The upper part is then shifted down and m
 * is conditionally subtracted once, without branching on the data.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_4096_mont_reduce_81(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper digits before they start receiving carries. */
    sp_4096_norm_81(a + 79);

    for (i=0; i<78; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3ffffff;
        sp_4096_mul_add_81(a+i, m, mu);
        a[i+1] += a[i] >> 26;
    }
    /* Last digit: only its low 20 bits belong to the part being reduced. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffL;
    sp_4096_mul_add_81(a+i, m, mu);
    a[i+1] += a[i] >> 26;
    a[i] &= 0x3ffffff;
    sp_4096_mont_shift_81(a, a);
    /* Compare top digits only. The mask is all ones when over >= 1 and zero
     * otherwise - assumes sp_digit is a signed 32-bit type with arithmetic
     * right shift; TODO confirm. */
    over = a[78] - m[78];
    sp_4096_cond_sub_81(a, a, m, ~((over - 1) >> 31));
    sp_4096_norm_81(a);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r first and then reduced in place, so r
 * must have room for the double-width product.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_mul_81(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Plain multiplication followed by Montgomery reduction of the result. */
    sp_4096_mul_81(r, a, b);
    sp_4096_mont_reduce_81(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is written to r first and then reduced in place, so r
 * must have room for the double-width result.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_4096_mont_sqr_81(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Plain squaring followed by Montgomery reduction of the result. */
    sp_4096_sqr_81(r, a);
    sp_4096_mont_reduce_81(r, m, mp);
}

/* Multiply a by scalar b into r. (r = a * b)
 *
 * A running 64-bit accumulator carries the overflow of each digit product
 * into the next digit. Every output digit is masked to 26 bits.
 *
 * r  Result of the multiplication. Receives 82 digits (r[0..81]).
 * a  Number to multiply (81 digits).
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_d_81(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int k;

    for (k = 0; k < 81; k++) {
        acc += scalar * a[k];
        r[k] = (sp_digit)(acc & 0x3ffffff);
        acc >>= 26;
    }
    /* Whatever carry is left becomes the extra top digit. */
    r[81] = (sp_digit)(acc & 0x3ffffff);
}

#ifndef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Each digit of b is ANDed with the mask before being added, so the same
 * operations run whether or not the addition takes effect. No carry is
 * moved between digits here.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_4096_cond_add_81(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

    for (k = 0; k < 81; k++) {
        r[k] = a[k] + (b[k] & m);
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift an 81-digit number right by n bits. (r = a >> n)
 *
 * Each output digit combines the upper bits of the matching input digit
 * with the low n bits of the next higher digit, masked to 26 bits. The top
 * digit is simply shifted.
 *
 * r  Result of the shift (81 digits).
 * a  Number to shift (81 digits).
 * n  Number of bits to shift by; presumably 0 < n < 26, since 26 - n is
 *    used as a left shift count - confirm with callers.
 */
SP_NOINLINE static void sp_4096_rshift_81(sp_digit* r, const sp_digit* a,
        byte n)
{
    int k;

    for (k = 0; k < 80; k++) {
        r[k] = (a[k] >> n) | ((a[k + 1] << (26 - n)) & 0x3ffffff);
    }
    r[80] = a[80] >> n;
}

/* Divide the two-digit value (d1 << 26) + d0 by div.
 *
 * One of four implementations is selected at compile time:
 *  - SP_USE_DIVTI3: plain 64-bit division.
 *  - x86 / x86_64: a single idiv instruction on the 64-bit value split into
 *    32-bit halves.
 *  - otherwise, unless __aarch64__ or SP_DIV_WORD_USE_DIV is defined: a
 *    branch-free bit-by-bit estimate followed by correction steps.
 *  - fallback: three narrower divisions that refine the quotient in stages.
 *
 * d1   Upper digit of the number to divide.
 * d0   Lower digit of the number to divide.
 * div  Digit to divide by.
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_4096_div_word_81(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 26) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* lo is bound to the "a" register and hi to the "d" register; the
     * quotient comes back in lo. */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo)
        : "d" (hi), "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 26);
    sp_digit t0 = (sp_digit)(d & 0x3ffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* Build a quotient estimate one bit per step against dv; the sign bit
     * of (dv - t1) is used as the comparison result instead of a branch. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    for (i = 24; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 25) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Correct the estimate using the remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 26);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 52) - (sp_digit)(d >> 52);

    /* Two final adjustments by at most one each; sign is +1 or -1 depending
     * on bit 31 of the remainder. */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    sp_digit r = 0;
    sp_digit t;
    sp_digit dv = (div >> 11) + 1;

    /* Stage 1: estimate the upper quotient bits from the top of d. */
    t = (sp_digit)(d >> 22);
    t = (t / dv) << 11;
    r += t;
    d -= (sp_int64)t * div;
    /* Stage 2: refine with the next bits of the remainder. */
    t = (sp_digit)(d >> 7);
    t = t / (dv << 4);
    r += t;
    d -= (sp_int64)t * div;
    /* Stage 3: the remainder is divided directly. */
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Divide a single digit d by div.
 *
 * On x86, x86_64 and aarch64, or when SP_DIV_WORD_USE_DIV is defined, this
 * is a plain division. Otherwise no division is used: the result is bit 31
 * of (div - d) taken as an unsigned 32-bit value, i.e. 1 when that
 * difference is negative and 0 otherwise. NOTE(review): that only matches
 * a true quotient if d is less than 2 * div - confirm with callers.
 *
 * d    Digit to divide.
 * div  Digit to divide by.
 * returns the quotient (or the 0/1 estimate described above).
 */
static WC_INLINE sp_digit sp_4096_word_div_word_81(sp_digit d, sp_digit div)
{
#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
    defined(SP_DIV_WORD_USE_DIV)
    return d / div;
#else
    return (sp_digit)((sp_uint32)(div - d) >> 31);
#endif
}
/* Divide a by d and put the remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are multiplied by 2^6 before the division and the remainder
 * is shifted back down by 6 bits at the end.
 * Note: 162 digits of working data are copied into r before the final fix-up,
 * so r must have room for at least 162 digits.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result. Not used.
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_div_81(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 81 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* Single scratch allocation that is carved into t1, t2 and sd below. */
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 81 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* t1: scaled dividend / running remainder.
         * t2: product of the scaled divisor and a quotient digit.
         * sd: scaled divisor.
         */
        t2 = t1 + 162 + 1;
        sd = t2 + 81 + 1;

        /* Scale divisor and dividend by the same factor (2^6). */
        sp_4096_mul_d_81(sd, d, (sp_digit)1 << 6);
        sp_4096_mul_d_162(t1, a, (sp_digit)1 << 6);
        /* Digit of the scaled divisor used for all quotient estimates. */
        dv = sd[78];
        /* Fold the bits above 26 of the digit below into the top digit. */
        t1[79 + 79] += t1[79 + 79 - 1] >> 26;
        t1[79 + 79 - 1] &= 0x3ffffff;
        for (i=79; i>=0; i--) {
            /* Estimate a quotient digit from the top two remaining digits. */
            r1 = sp_4096_div_word_81(t1[79 + i], t1[79 + i - 1], dv);

            /* Subtract estimate * divisor at digit position i. */
            sp_4096_mul_d_81(t2, sd, r1);
            (void)sp_4096_sub_81(&t1[i], &t1[i], t2);
            sp_4096_norm_79(&t1[i]);
            t1[79 + i] += t1[79 + i - 1] >> 26;
            t1[79 + i - 1] &= 0x3ffffff;
            /* Correction: estimate again from the negated top digits, adjust
             * by the top digit and add that multiple of the divisor back.
             */
            r1 = sp_4096_div_word_81(-t1[79 + i], -t1[79 + i - 1], dv);
            r1 -= t1[79 + i];
            sp_4096_mul_d_81(t2, sd, r1);
            (void)sp_4096_add_81(&t1[i], &t1[i], t2);
            t1[79 + i] += t1[79 + i - 1] >> 26;
            t1[79 + i - 1] &= 0x3ffffff;
        }
        /* One more single-digit estimate and subtraction on what is left. */
        t1[79 - 1] += t1[79 - 2] >> 26;
        t1[79 - 2] &= 0x3ffffff;
        r1 = sp_4096_word_div_word_81(t1[79 - 1], dv);

        sp_4096_mul_d_81(t2, sd, r1);
        sp_4096_sub_81(t1, t1, t2);
        /* Copies 162 digits into r. */
        XMEMCPY(r, t1, sizeof(*r) * 162U);
        /* Propagate carries through the low digits. */
        for (i=0; i<78; i++) {
            r[i+1] += r[i] >> 26;
            r[i] &= 0x3ffffff;
        }
        /* Mask is r[78] >> 31: add the scaled divisor back when the top digit
         * is negative (assumes an arithmetic shift of the 32-bit digit).
         */
        sp_4096_cond_add_81(r, r, sd, r[78] >> 31);

        sp_4096_norm_79(r);
        /* Undo the scaling by 2^6. */
        sp_4096_rshift_81(r, r, 6);
        r[79] = 0;
        r[80] = 0;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Wrapper around sp_4096_div_81() that passes no quotient buffer.
 *
 * r  Receives the remainder.
 * a  Number to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_mod_81(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err;

    err = sp_4096_div_81(a, m, NULL, r);

    return err;
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Three implementations are selected at compile time:
 *  - WOLFSSL_SP_SMALL without WOLFSSL_SP_FAST_MODEXP: bit-at-a-time, with a
 *    multiply and a square for every exponent bit.
 *  - otherwise, without WC_NO_CACHE_RESISTANT: the same bit-at-a-time loop.
 *  - otherwise: 5-bit fixed window with a table of 32 powers.
 *
 * Note: the result copy writes 162 digits to r.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When not 0, a is reduced modulo m before it is used.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is 0.
 */
static int sp_4096_mod_exp_81(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 162];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 81 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 81 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 81U * 2U);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_81(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_81(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 81U);
        }
    }
    if (err == MP_OKAY) {
        /* t[1] = (a * norm) mod m */
        sp_4096_mul_81(t[1], t[1], norm);
        err = sp_4096_mod_81(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top bit; c counts the bits left in
         * the current digit and n holds them aligned to bit 25.
         */
        i = bits / 26;
        c = bits % 26;
        n = e[i--] << (26 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 26;
            }

            /* Next exponent bit, most significant first. */
            y = (int)((n >> 25) & 1);
            n <<= 1;

            /* Product of both values goes to the slot not selected by y. */
            sp_4096_mont_mul_81(t[y^1], t[0], t[1], m, mp);

            /* Square the other slot through t[2]. The slot address is built
             * from both pointers and addr_mask[] (defined elsewhere in the
             * file) rather than by indexing t[] with y.
             * NOTE(review): presumably addr_mask is { 0, all ones } so that
             * the sum selects t[y] - confirm against its definition.
             */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 81 * 2);
            sp_4096_mont_sqr_81(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 81 * 2);
        }

        /* Reduce t[0], then subtract m when the compare result is not
         * negative.
         */
        sp_4096_mont_reduce_81(t[0], m, mp);
        n = sp_4096_cmp_81(t[0], m);
        sp_4096_cond_sub_81(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 81 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 162];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 81 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* norm shares storage with t[0]. */
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 81 * 2);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_81(norm, m);

        /* t[1] = (a * norm) mod m, with a reduced first when requested. */
        if (reduceA != 0) {
            err = sp_4096_mod_81(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_81(t[1], t[1], norm);
                err = sp_4096_mod_81(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_81(t[1], a, norm);
            err = sp_4096_mod_81(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Start at the digit holding the top bit; c counts the bits left in
         * the current digit and n holds them aligned to bit 25.
         */
        i = bits / 26;
        c = bits % 26;
        n = e[i--] << (26 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 26;
            }

            /* Next exponent bit, most significant first. */
            y = (int)((n >> 25) & 1);
            n <<= 1;

            /* Product of both values goes to the slot not selected by y. */
            sp_4096_mont_mul_81(t[y^1], t[0], t[1], m, mp);

            /* Square the other slot through t[2]; the slot address is built
             * from both pointers and addr_mask[] (defined elsewhere in the
             * file) rather than by indexing t[] with y.
             */
            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 81 * 2);
            sp_4096_mont_sqr_81(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 81 * 2);
        }

        /* Reduce t[0], then subtract m when the compare result is not
         * negative.
         */
        sp_4096_mont_reduce_81(t[0], m, mp);
        n = sp_4096_cmp_81(t[0], m);
        sp_4096_cond_sub_81(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 81 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(32 * 162) + 162];
#endif
    sp_digit* t[32];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((32 * 162) + 162), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* 32 table entries of 162 digits each; norm shares storage with
         * t[0]. rt is the extra 162 digits after the table (32 * 162 = 5184).
         */
        norm = td;
        for (i=0; i<32; i++)
            t[i] = td + i * 162;
        rt = td + 5184;

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_81(norm, m);

        /* t[1] = (a * norm) mod m, with a reduced first when requested. */
        if (reduceA != 0) {
            err = sp_4096_mod_81(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_81(t[1], t[1], norm);
                err = sp_4096_mod_81(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_81(t[1], a, norm);
            err = sp_4096_mod_81(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Build the table from t[1]: even entries t[2k] are the square of
         * t[k], odd entries t[2k+1] are t[k+1] * t[k].
         */
        sp_4096_mont_sqr_81(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_81(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_81(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_81(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_81(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_81(t[ 7], t[ 4], t[ 3], m, mp);
        sp_4096_mont_sqr_81(t[ 8], t[ 4], m, mp);
        sp_4096_mont_mul_81(t[ 9], t[ 5], t[ 4], m, mp);
        sp_4096_mont_sqr_81(t[10], t[ 5], m, mp);
        sp_4096_mont_mul_81(t[11], t[ 6], t[ 5], m, mp);
        sp_4096_mont_sqr_81(t[12], t[ 6], m, mp);
        sp_4096_mont_mul_81(t[13], t[ 7], t[ 6], m, mp);
        sp_4096_mont_sqr_81(t[14], t[ 7], m, mp);
        sp_4096_mont_mul_81(t[15], t[ 8], t[ 7], m, mp);
        sp_4096_mont_sqr_81(t[16], t[ 8], m, mp);
        sp_4096_mont_mul_81(t[17], t[ 9], t[ 8], m, mp);
        sp_4096_mont_sqr_81(t[18], t[ 9], m, mp);
        sp_4096_mont_mul_81(t[19], t[10], t[ 9], m, mp);
        sp_4096_mont_sqr_81(t[20], t[10], m, mp);
        sp_4096_mont_mul_81(t[21], t[11], t[10], m, mp);
        sp_4096_mont_sqr_81(t[22], t[11], m, mp);
        sp_4096_mont_mul_81(t[23], t[12], t[11], m, mp);
        sp_4096_mont_sqr_81(t[24], t[12], m, mp);
        sp_4096_mont_mul_81(t[25], t[13], t[12], m, mp);
        sp_4096_mont_sqr_81(t[26], t[13], m, mp);
        sp_4096_mont_mul_81(t[27], t[14], t[13], m, mp);
        sp_4096_mont_sqr_81(t[28], t[14], m, mp);
        sp_4096_mont_mul_81(t[29], t[15], t[14], m, mp);
        sp_4096_mont_sqr_81(t[30], t[15], m, mp);
        sp_4096_mont_mul_81(t[31], t[16], t[15], m, mp);

        /* Round bits up to a multiple of the window size (5). */
        bits = ((bits + 4) / 5) * 5;
        /* Index of the digit holding the top bit and the bit count in it. */
        i = ((bits + 25) / 26) - 1;
        c = bits % 26;
        if (c == 0) {
            c = 26;
        }
        /* n holds the pending exponent bits aligned to the top of 32 bits.
         * When rounding up moved the top index past the last digit, start
         * from zero bits instead of reading e[81].
         */
        if (i < 81) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        /* Fewer than 5 bits available: pull in the next digit as well. */
        if (c < 5) {
            n |= e[i--] << (6 - c);
            c += 26;
        }
        /* First window initialises the running result from the table. */
        y = (int)((n >> 27) & 0x1f);
        n <<= 5;
        c -= 5;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 162);
        while ((i >= 0) || (c >= 5)) {
            if (c >= 5) {
                /* Whole window available in n. */
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* n is empty: load the next digit and take its top 5 bits. */
                n = e[i--] << 6;
                y = (byte)((n >> 27) & 0x1f);
                n <<= 5;
                c = 21;
            }
            else {
                /* Window straddles two digits: take what is left in n and
                 * complete it with the top bits of the next digit.
                 */
                y = (byte)((n >> 27) & 0x1f);
                n = e[i--] << 6;
                c = 5 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 26 - c;
            }

            /* Five squarings, then multiply by the table entry for y. */
            sp_4096_mont_sqr_81(rt, rt, m, mp);
            sp_4096_mont_sqr_81(rt, rt, m, mp);
            sp_4096_mont_sqr_81(rt, rt, m, mp);
            sp_4096_mont_sqr_81(rt, rt, m, mp);
            sp_4096_mont_sqr_81(rt, rt, m, mp);

            sp_4096_mont_mul_81(rt, rt, t[y], m, mp);
        }

        /* Reduce rt, then subtract m when the compare result is not
         * negative.
         */
        sp_4096_mont_reduce_81(rt, m, mp);
        n = sp_4096_cmp_81(rt, m);
        sp_4096_cond_sub_81(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 162);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}

#endif /* WOLFSSL_HAVE_SP_RSA & !SP_RSA_PRIVATE_EXP_D */
#endif /* (WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH) & !WOLFSSL_RSA_PUBLIC_ONLY */

/* Calculate the Montgomery normalizer: r = 2^n mod m, where n is the number
 * of bits being reduced by (4096).
 * The modulus is expected to be a full 4096 bits, so a single subtraction is
 * all the reduction that is done.
 *
 * r  A single precision number that receives the result (162 digits).
 * m  A single precision number that is the modulus.
 */
static void sp_4096_mont_norm_162(sp_digit* r, const sp_digit* m)
{
    int k;

    /* r = 2^4096 - 1: 157 digits of 26 set bits ... */
    for (k = 0; k <= 156; k++) {
        r[k] = 0x3ffffff;
    }
    /* ... plus 14 set bits in digit 157 (157 * 26 + 14 = 4096). */
    r[157] = 0x3fffL;
    /* Remaining digits are zero. */
    for (k = 158; k < 162; k++) {
        r[k] = 0;
    }

    /* r = (2^n - 1) - m */
    (void)sp_4096_sub_162(r, r, m);

    /* Adding one gives r = 2^n - m. */
    r[0] += 1;
}

/* Compare two 162 digit numbers without branching on the data.
 *
 * Digits are visited from most to least significant. Each digit difference is
 * masked using the value accumulated so far before being merged in; while the
 * accumulated value is zero the mask is all ones.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_4096_cmp_162(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

    for (k = 161; k >= 0; k--) {
        diff |= (a[k] - b[k]) & ~(((sp_digit)0 - diff) >> 25);
    }

    return diff;
}

/* Subtract b from a under control of a mask. (r = a - (b & m))
 * Pass m as -1 to perform the subtraction and 0 to copy a unchanged.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value applied to every digit of b.
 */
static void sp_4096_cond_sub_162(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

    for (k = 0; k < 162; k++) {
        r[k] = a[k] - (b[k] & m);
    }
}

/* Multiply a by the scalar b and accumulate into r. (r += a * b)
 *
 * The low 162 digits of r are left masked to 26 bits and the final carry is
 * added into r[162], so r must have at least 163 digits.
 *
 * r  A single precision integer that is added to.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_add_162(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int k;

    /* One digit at a time, carrying in a 64-bit accumulator. */
    for (k = 0; k < 162; k++) {
        acc += (scalar * a[k]) + r[k];
        r[k] = ((sp_digit)acc) & 0x3ffffff;
        acc >>= 26;
    }
    r[162] += (sp_digit)acc;
#else
    sp_int64 scalar = b;
    sp_int64 lane[8];
    int i;
    int j;

    /* Eight digits per pass: all products are formed first, then the carries
     * are chained through the lanes. lane[0] carries between passes.
     */
    lane[0] = 0;
    for (i = 0; i < 160; i += 8) {
        lane[0] += (scalar * a[i]) + r[i];
        for (j = 1; j < 8; j++) {
            lane[j] = (scalar * a[i + j]) + r[i + j];
        }
        r[i] = lane[0] & 0x3ffffff;
        for (j = 1; j < 8; j++) {
            lane[j] += lane[j - 1] >> 26;
            r[i + j] = lane[j] & 0x3ffffff;
        }
        lane[0] = lane[7] >> 26;
    }
    /* Last two digits and the carry out. */
    lane[0] += (scalar * a[160]) + r[160];
    lane[1]  = (scalar * a[161]) + r[161];
    r[160] = lane[0] & 0x3ffffff;
    lane[1] += lane[0] >> 26;
    r[161] = lane[1] & 0x3ffffff;
    r[162] += (sp_digit)(lane[1] >> 26);
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Move the high 4096 bits of a down to the bottom of r.
 *
 * Bit 4096 sits at bit 14 of digit 157, so the value is rebuilt starting
 * from a[157] >> 14 and each following digit is merged in 12 bits higher.
 * Digits 158 to 315 of r are cleared afterwards.
 * Reads of a stay ahead of the writes to r, so r may be the same buffer as a.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to shift down.
 */
static void sp_4096_mont_shift_162(sp_digit* r, const sp_digit* a)
{
    int k;
    sp_int64 acc;

    acc = a[157] >> 14;
    acc += ((sp_int64)a[158]) << 12;
    for (k = 0; k < 157; k++) {
        r[k] = acc & 0x3ffffff;
        acc >>= 26;
        acc += ((sp_int64)a[k + 159]) << 12;
    }
    r[157] = (sp_digit)acc;
    XMEMSET(&r[158], 0, sizeof(*r) * 158U);
}

/* Reduce the number back to 4096 bits using Montgomery reduction.
 *
 * For each of the low 158 digits a multiple (mu) of the modulus is added at
 * that digit position and the bits above 26 are carried into the next digit.
 * The high part is then shifted down and the modulus is conditionally
 * subtracted once.
 * When WOLFSSL_SP_DH is defined and mp is 1, the multiplication by mp is
 * skipped.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_4096_mont_reduce_162(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    sp_4096_norm_162(a + 158);

#ifdef WOLFSSL_SP_DH
    if (mp != 1) {
        for (i=0; i<157; i++) {
            /* mu = low 26 bits of a[i] * mp (unsigned 32-bit multiply). */
            mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3ffffff;
            sp_4096_mul_add_162(a+i, m, mu);
            a[i+1] += a[i] >> 26;
        }
        /* Digit 157 only has 14 bits below bit 4096. */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3fffL;
        sp_4096_mul_add_162(a+i, m, mu);
        a[i+1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
    else {
        /* mp is 1: mu is taken directly from the digit. */
        for (i=0; i<157; i++) {
            mu = a[i] & 0x3ffffff;
            sp_4096_mul_add_162(a+i, m, mu);
            a[i+1] += a[i] >> 26;
        }
        mu = a[i] & 0x3fffL;
        sp_4096_mul_add_162(a+i, m, mu);
        a[i+1] += a[i] >> 26;
        a[i] &= 0x3ffffff;
    }
#else
    for (i=0; i<157; i++) {
        /* mu = low 26 bits of a[i] * mp (unsigned 32-bit multiply). */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3ffffff;
        sp_4096_mul_add_162(a+i, m, mu);
        a[i+1] += a[i] >> 26;
    }
    /* Digit 157 only has 14 bits below bit 4096. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3fffL;
    sp_4096_mul_add_162(a+i, m, mu);
    a[i+1] += a[i] >> 26;
    a[i] &= 0x3ffffff;
#endif
    /* Bring the high half down, in place. */
    sp_4096_mont_shift_162(a, a);
    /* Subtract m when the top digit of a is greater than the top digit of m:
     * over >= 1 makes (over - 1) >> 31 zero and so the mask all ones
     * (assumes an arithmetic shift of the 32-bit digit).
     */
    over = a[157] - m[157];
    sp_4096_cond_sub_162(a, a, m, ~((over - 1) >> 31));
    sp_4096_norm_162(a);
}

/* Montgomery multiplication: full product followed by a Montgomery reduction
 * of the product in place. (r = a * b mod m)
 *
 * r   Receives the result.
 * a   First operand in Montgomery form.
 * b   Second operand in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier passed through to the reduction.
 */
SP_NOINLINE static void sp_4096_mont_mul_162(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    /* Double-width product first ... */
    sp_4096_mul_162(r, a, b);
    /* ... then reduce it where it sits. */
    sp_4096_mont_reduce_162(r, m, mp);
}

/* Montgomery squaring: full square followed by a Montgomery reduction of the
 * square in place. (r = a * a mod m)
 *
 * r   Receives the result.
 * a   Operand in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier passed through to the reduction.
 */
SP_NOINLINE static void sp_4096_mont_sqr_162(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    /* Double-width square first ... */
    sp_4096_sqr_162(r, a);
    /* ... then reduce it where it sits. */
    sp_4096_mont_reduce_162(r, m, mp);
}

/* Multiply a 324 digit number by a scalar. (r = a * b)
 *
 * Each result digit is masked to 26 bits; the low 26 bits of the final carry
 * are stored in r[324], so r must have at least 325 digits.
 *
 * r  A single precision integer that receives the result.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_4096_mul_d_324(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
    sp_int64 scalar = b;
    sp_int64 acc = 0;
    int k;

    for (k = 0; k < 324; k++) {
        acc += scalar * a[k];
        r[k] = (sp_digit)(acc & 0x3ffffff);
        acc >>= 26;
    }
    r[324] = (sp_digit)(acc & 0x3ffffff);
}

#ifndef WOLFSSL_SP_SMALL
/* Add b to a under control of a mask. (r = a + (b & m))
 * Pass m as -1 to perform the addition and 0 to copy a unchanged.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to add to.
 * b  A single precision number to add.
 * m  Mask value applied to every digit of b.
 */
static void sp_4096_cond_add_162(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k;

    for (k = 0; k < 162; k++) {
        r[k] = a[k] + (b[k] & m);
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Shift a 162 digit number right by n bits. (r = a >> n)
 *
 * Each result digit takes the upper bits of its own digit and the low n bits
 * of the next digit up, masked to 26 bits.
 * Digits are processed from least to most significant.
 *
 * r  A single precision number that receives the result.
 * a  A single precision number to shift.
 * n  Number of bits to shift by.
 */
SP_NOINLINE static void sp_4096_rshift_162(sp_digit* r, const sp_digit* a,
        byte n)
{
    int k;

    for (k = 0; k < 161; k++) {
        r[k] = (a[k] >> n) | ((a[k + 1] << (26 - n)) & 0x3ffffff);
    }
    /* Top digit has nothing above it to shift in. */
    r[161] = a[161] >> n;
}

/* Divide the two digit value (d1 * 2^26 + d0) by the digit div.
 *
 * Four implementations are selected at compile time:
 *  - SP_USE_DIVTI3: a plain 64-bit division.
 *  - x86 / x86_64: an inline "idiv" instruction.
 *  - neither AArch64 nor SP_DIV_WORD_USE_DIV: built from shifts, subtractions
 *    and multiplications only, without a divide.
 *  - otherwise: three narrower divisions on parts of the value.
 *
 * d1   Upper digit of the value to divide.
 * d0   Lower digit of the value to divide.
 * div  Digit to divide by.
 * returns the quotient.
 */
static WC_INLINE sp_digit sp_4096_div_word_162(sp_digit d1, sp_digit d0,
    sp_digit div)
{
#ifdef SP_USE_DIVTI3
    sp_int64 d = ((sp_int64)d1 << 26) + d0;

    return d / div;
#elif defined(__x86_64__) || defined(__i386__)
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    /* Split the 64-bit dividend into the two 32-bit halves given to idiv. */
    sp_uint32 lo = (sp_uint32)d;
    sp_digit hi = (sp_digit)(d >> 32);

    /* lo is bound to the "a" register (in/out) and hi to the "d" register;
     * the result is read back from lo.
     */
    __asm__ __volatile__ (
        "idiv %2"
        : "+a" (lo)
        : "d" (hi), "r" (div)
        : "cc"
    );

    return (sp_digit)lo;
#elif !defined(__aarch64__) &&  !defined(SP_DIV_WORD_USE_DIV)
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    /* Works with half the divisor, rounded up. */
    sp_digit dv = (div >> 1) + 1;
    sp_digit t1 = (sp_digit)(d >> 26);
    sp_digit t0 = (sp_digit)(d & 0x3ffffff);
    sp_digit t2;
    sp_digit sign;
    sp_digit r;
    int i;
    sp_int64 m;

    /* First quotient bit: 1 when (dv - t1) is negative as a 32-bit value. */
    r = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
    t1 -= dv & (0 - r);
    /* One quotient bit per iteration: shift in the next bit of t0, compare
     * against dv and subtract dv under a mask.
     */
    for (i = 24; i >= 1; i--) {
        t1 += t1 + (((sp_uint32)t0 >> 25) & 1);
        t0 <<= 1;
        t2 = (sp_digit)(((sp_uint32)(dv - t1)) >> 31);
        r += r + t2;
        t1 -= dv & (0 - t2);
        t1 += t2;
    }
    r += r + 1;

    /* Corrections using the exact remainder m = d - r * div. */
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 26);
    m = d - ((sp_int64)r * div);
    r += (sp_digit)(m >> 52) - (sp_digit)(d >> 52);

    /* Two final adjustments: sign is -1 when bit 31 of m is set, otherwise
     * +1; step r by sign when (div - |m|) is negative as a 32-bit value.
     */
    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;

    m = d - ((sp_int64)r * div);
    sign = (sp_digit)(0 - ((sp_uint32)m >> 31)) * 2 + 1;
    m *= sign;
    t2 = (sp_digit)(((sp_uint32)(div - m)) >> 31);
    r += sign * t2;
   return r;
#else
    sp_int64 d = ((sp_int64)d1 << 26) + d0;
    sp_digit r = 0;
    sp_digit t;
    /* Top bits of the divisor, rounded up. */
    sp_digit dv = (div >> 11) + 1;

    /* Partial quotient from the top of d, then remove it from d. */
    t = (sp_digit)(d >> 22);
    t = (t / dv) << 11;
    r += t;
    d -= (sp_int64)t * div;
    /* Partial quotient from the middle of what is left. */
    t = (sp_digit)(d >> 7);
    t = t / (dv << 4);
    r += t;
    d -= (sp_int64)t * div;
    /* Last partial quotient uses the full divisor. */
    t = (sp_digit)d;
    t = t / div;
    r += t;
    d -= (sp_int64)t * div;
    return r;
#endif
}
/* Quotient estimate for one digit divided by one digit.
 *
 * When a native divide is selected (x86, x86_64, AArch64 or
 * SP_DIV_WORD_USE_DIV) the result is d / div. Otherwise no division is
 * performed: the result is bit 31 of the 32-bit value (div - d).
 *
 * d    Digit to be divided.
 * div  Digit to divide by.
 * returns the quotient estimate.
 */
static WC_INLINE sp_digit sp_4096_word_div_word_162(sp_digit d, sp_digit div)
{
    sp_digit q;

#if defined(SP_DIV_WORD_USE_DIV) || defined(__aarch64__) || \
    defined(__i386__) || defined(__x86_64__)
    q = d / div;
#else
    q = (sp_digit)((sp_uint32)(div - d) >> 31);
#endif

    return q;
}
/* Divide a by d and put the remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Full implementation.
 *
 * Both operands are multiplied by 2^12 before the division and the remainder
 * is shifted back down by 12 bits at the end.
 * Note: 324 digits of working data are copied into r before the final fix-up,
 * so r must have room for at least 324 digits.
 *
 * a  Number to be divided.
 * d  Number to divide with.
 * m  Multiplier result. Not used.
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_div_162(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
#ifndef WOLFSSL_SP_DIV_32
#endif
    sp_digit dv;
    sp_digit r1;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 162 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* Single scratch allocation that is carved into t1, t2 and sd below. */
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 162 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* t1: scaled dividend / running remainder.
         * t2: product of the scaled divisor and a quotient digit.
         * sd: scaled divisor.
         */
        t2 = t1 + 324 + 1;
        sd = t2 + 162 + 1;

        /* Scale divisor and dividend by the same factor (2^12). */
        sp_4096_mul_d_162(sd, d, (sp_digit)1 << 12);
        sp_4096_mul_d_324(t1, a, (sp_digit)1 << 12);
        /* Digit of the scaled divisor used for all quotient estimates. */
        dv = sd[157];
        /* Fold the bits above 26 of the digit below into the top digit. */
        t1[158 + 158] += t1[158 + 158 - 1] >> 26;
        t1[158 + 158 - 1] &= 0x3ffffff;
        for (i=158; i>=0; i--) {
            /* Estimate a quotient digit from the top two remaining digits. */
            r1 = sp_4096_div_word_162(t1[158 + i], t1[158 + i - 1], dv);

            /* Subtract estimate * divisor at digit position i. */
            sp_4096_mul_d_162(t2, sd, r1);
            (void)sp_4096_sub_162(&t1[i], &t1[i], t2);
            sp_4096_norm_158(&t1[i]);
            t1[158 + i] += t1[158 + i - 1] >> 26;
            t1[158 + i - 1] &= 0x3ffffff;
            /* Correction: estimate again from the negated top digits, adjust
             * by the top digit and add that multiple of the divisor back.
             */
            r1 = sp_4096_div_word_162(-t1[158 + i], -t1[158 + i - 1], dv);
            r1 -= t1[158 + i];
            sp_4096_mul_d_162(t2, sd, r1);
            (void)sp_4096_add_162(&t1[i], &t1[i], t2);
            t1[158 + i] += t1[158 + i - 1] >> 26;
            t1[158 + i - 1] &= 0x3ffffff;
        }
        /* One more single-digit estimate and subtraction on what is left. */
        t1[158 - 1] += t1[158 - 2] >> 26;
        t1[158 - 2] &= 0x3ffffff;
        r1 = sp_4096_word_div_word_162(t1[158 - 1], dv);

        sp_4096_mul_d_162(t2, sd, r1);
        sp_4096_sub_162(t1, t1, t2);
        /* Copies 324 digits into r. */
        XMEMCPY(r, t1, sizeof(*r) * 324U);
        /* Propagate carries through the low digits. */
        for (i=0; i<157; i++) {
            r[i+1] += r[i] >> 26;
            r[i] &= 0x3ffffff;
        }
        /* Mask is r[157] >> 31: add the scaled divisor back when the top
         * digit is negative (assumes an arithmetic shift of the 32-bit digit).
         */
        sp_4096_cond_add_162(r, r, sd, r[157] >> 31);

        sp_4096_norm_158(r);
        /* Undo the scaling by 2^12. */
        sp_4096_rshift_162(r, r, 12);
        r[158] = 0;
        r[159] = 0;
        r[160] = 0;
        r[161] = 0;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Wrapper around sp_4096_div_162() that passes no quotient buffer.
 *
 * r  Receives the remainder.
 * a  Number to reduce.
 * m  Modulus to reduce by.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_4096_mod_162(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err;

    err = sp_4096_div_162(a, m, NULL, r);

    return err;
}

#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || defined(WOLFSSL_HAVE_SP_DH)
#if (defined(WOLFSSL_HAVE_SP_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
                                                     defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r     A single precision number that is the result of the operation.
 * a     A single precision number being exponentiated.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when base is even or exponent is 0.
 */
static int sp_4096_mod_exp_162(sp_digit* r, const sp_digit* a, const sp_digit* e,
    int bits, const sp_digit* m, int reduceA)
{
#if defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SP_FAST_MODEXP)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 324];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 162 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 162 * 2);
            XMEMSET(t[i], 0, sizeof(sp_digit) * 162U * 2U);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_162(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_162(t[1], a, m);
        }
        else {
            XMEMCPY(t[1], a, sizeof(sp_digit) * 162U);
        }
    }
    if (err == MP_OKAY) {
        sp_4096_mul_162(t[1], t[1], norm);
        err = sp_4096_mod_162(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        i = bits / 26;
        c = bits % 26;
        n = e[i--] << (26 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 26;
            }

            y = (int)((n >> 25) & 1);
            n <<= 1;

            sp_4096_mont_mul_162(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 162 * 2);
            sp_4096_mont_sqr_162(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 162 * 2);
        }

        sp_4096_mont_reduce_162(t[0], m, mp);
        n = sp_4096_cmp_162(t[0], m);
        sp_4096_cond_sub_162(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 162 * 2);

    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#elif !defined(WC_NO_CACHE_RESISTANT)
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[3 * 324];
#endif
    sp_digit* t[3] = {0, 0, 0};
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 3 * 162 * 2, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<3; i++) {
            t[i] = td + (i * 162 * 2);
        }

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_162(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_162(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_162(t[1], t[1], norm);
                err = sp_4096_mod_162(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_162(t[1], a, norm);
            err = sp_4096_mod_162(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        i = bits / 26;
        c = bits % 26;
        n = e[i--] << (26 - c);
        for (; ; c--) {
            if (c == 0) {
                if (i == -1) {
                    break;
                }

                n = e[i--];
                c = 26;
            }

            y = (int)((n >> 25) & 1);
            n <<= 1;

            sp_4096_mont_mul_162(t[y^1], t[0], t[1], m, mp);

            XMEMCPY(t[2], (void*)(((size_t)t[0] & addr_mask[y^1]) +
                                  ((size_t)t[1] & addr_mask[y])),
                                  sizeof(*t[2]) * 162 * 2);
            sp_4096_mont_sqr_162(t[2], t[2], m, mp);
            XMEMCPY((void*)(((size_t)t[0] & addr_mask[y^1]) +
                            ((size_t)t[1] & addr_mask[y])), t[2],
                            sizeof(*t[2]) * 162 * 2);
        }

        sp_4096_mont_reduce_162(t[0], m, mp);
        n = sp_4096_cmp_162(t[0], m);
        sp_4096_cond_sub_162(t[0], t[0], m, ~(n >> 31));
        XMEMCPY(r, t[0], sizeof(*r) * 162 * 2);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[(16 * 324) + 324];
#endif
    sp_digit* t[16];
    sp_digit* rt = NULL;
    sp_digit* norm = NULL;
    sp_digit mp = 1;
    sp_digit n;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * ((16 * 324) + 324), NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        norm = td;
        for (i=0; i<16; i++)
            t[i] = td + i * 324;
        rt = td + 5184;

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_162(norm, m);

        if (reduceA != 0) {
            err = sp_4096_mod_162(t[1], a, m);
            if (err == MP_OKAY) {
                sp_4096_mul_162(t[1], t[1], norm);
                err = sp_4096_mod_162(t[1], t[1], m);
            }
        }
        else {
            sp_4096_mul_162(t[1], a, norm);
            err = sp_4096_mod_162(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_4096_mont_sqr_162(t[ 2], t[ 1], m, mp);
        sp_4096_mont_mul_162(t[ 3], t[ 2], t[ 1], m, mp);
        sp_4096_mont_sqr_162(t[ 4], t[ 2], m, mp);
        sp_4096_mont_mul_162(t[ 5], t[ 3], t[ 2], m, mp);
        sp_4096_mont_sqr_162(t[ 6], t[ 3], m, mp);
        sp_4096_mont_mul_162(t[ 7], t[ 4], t[ 3], m, mp);
        sp_4096_mont_sqr_162(t[ 8], t[ 4], m, mp);
        sp_4096_mont_mul_162(t[ 9], t[ 5], t[ 4], m, mp);
        sp_4096_mont_sqr_162(t[10], t[ 5], m, mp);
        sp_4096_mont_mul_162(t[11], t[ 6], t[ 5], m, mp);
        sp_4096_mont_sqr_162(t[12], t[ 6], m, mp);
        sp_4096_mont_mul_162(t[13], t[ 7], t[ 6], m, mp);
        sp_4096_mont_sqr_162(t[14], t[ 7], m, mp);
        sp_4096_mont_mul_162(t[15], t[ 8], t[ 7], m, mp);

        bits = ((bits + 3) / 4) * 4;
        i = ((bits + 25) / 26) - 1;
        c = bits % 26;
        if (c == 0) {
            c = 26;
        }
        if (i < 162) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 4) {
            n |= e[i--] << (6 - c);
            c += 26;
        }
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        XMEMCPY(rt, t[y], sizeof(sp_digit) * 324);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                n = e[i--] << 6;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 22;
            }
            else {
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 6;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 26 - c;
            }

            sp_4096_mont_sqr_162(rt, rt, m, mp);
            sp_4096_mont_sqr_162(rt, rt, m, mp);
            sp_4096_mont_sqr_162(rt, rt, m, mp);
            sp_4096_mont_sqr_162(rt, rt, m, mp);

            sp_4096_mont_mul_162(rt, rt, t[y], m, mp);
        }

        sp_4096_mont_reduce_162(rt, m, mp);
        n = sp_4096_cmp_162(rt, m);
        sp_4096_cond_sub_162(rt, rt, m, ~(n >> 31));
        XMEMCPY(r, rt, sizeof(sp_digit) * 324);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
#endif
}
#endif /* (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) || */
       /* WOLFSSL_HAVE_SP_DH */

#endif /* (WOLFSSL_HAVE_SP_RSA && !WOLFSSL_RSA_PUBLIC_ONLY) || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation: out = in ^ em mod mm.
 *
 * Handles only odd, 4096-bit moduli and public exponents that fit in 64 bits.
 * Unless WOLFSSL_SP_SMALL is defined, an exponent of 3 is computed with one
 * square and one multiply; every other exponent uses a Montgomery
 * square-and-multiply loop.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. At most 512.
 * em      Public exponent. Non-zero and no more than 64 bits.
 * mm      Modulus. Exactly 4096 bits and odd.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  On entry the size of out in bytes; set to 512 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * the exponent, input or modulus has an unsupported length, MP_VAL when the
 * modulus is even, MP_EXPTMOD_E when the exponent is zero, MEMORY_E when
 * dynamic memory allocation fails, or the error from sp_4096_mod_162().
 */
int sp_RsaPublic_4096(const byte* in, word32 inLen, const mp_int* em,
    const mp_int* mm, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* buf = NULL;
#else
    sp_digit buf[162 * 5];
#endif
    sp_digit* base = NULL;
    sp_digit* res = NULL;
    sp_digit* mod = NULL;
    sp_digit* norm = NULL;
    sp_uint64 exp64 = 0;
    sp_digit mp = 0;
    int bit;
    int err = MP_OKAY;

    /* Validate lengths first; the checks are evaluated in this order. */
    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    else if ((mp_count_bits(em) > 64) || (inLen > 512U) ||
             (mp_count_bits(mm) != 4096)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mm)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        buf = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 5, NULL,
                                 DYNAMIC_TYPE_RSA);
        if (buf == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Carve the working buffer: base (2 * 162), result (2 * 162),
         * modulus (162). The Montgomery normalizer shares the result area. */
        base = buf;
        res  = base + 162 * 2;
        mod  = res + 162 * 2;
        norm = res;

        sp_4096_from_bin(base, 162, in, inLen);

        /* Gather up to 64 bits of exponent from the low mp_int digit(s). */
        exp64 = (sp_uint64)em->dp[0];
#if DIGIT_BIT < 64
        if (em->used > 1) {
            exp64 |= ((sp_uint64)em->dp[1]) << DIGIT_BIT;
        }
#endif
        if (exp64 == 0) {
            err = MP_EXPTMOD_E;
        }
    }

    if (err == MP_OKAY) {
        sp_4096_from_mp(mod, 162, mm);

#ifndef WOLFSSL_SP_SMALL
        if (exp64 == 0x3) {
            /* res = base^2 mod m, then res = base * res mod m. */
            sp_4096_sqr_162(res, base);
            err = sp_4096_mod_162(res, res, mod);
            if (err == MP_OKAY) {
                sp_4096_mul_162(res, base, res);
                err = sp_4096_mod_162(res, res, mod);
            }
        }
        else
#endif
        {
            sp_4096_mont_setup(mod, &mp);
            sp_4096_mont_norm_162(norm, mod);

            /* Multiply the base by the normalizer and reduce, ready for the
             * Montgomery operations below. */
            sp_4096_mul_162(base, base, norm);
            err = sp_4096_mod_162(base, base, mod);

            if (err == MP_OKAY) {
                /* Locate the top set bit; exp64 is non-zero here. */
                for (bit = 63; (bit > 0) && ((exp64 >> bit) == 0); bit--) {
                    /* searching */
                }

                /* The top bit is consumed by starting the result at base. */
                XMEMCPY(res, base, sizeof(sp_digit) * 162 * 2);
                for (bit--; bit >= 0; bit--) {
                    sp_4096_mont_sqr_162(res, res, mod, mp);

                    if (((exp64 >> bit) & 1) == 1) {
                        sp_4096_mont_mul_162(res, res, base, mod, mp);
                    }
                }

                sp_4096_mont_reduce_162(res, mod, mp);
                /* mp is reused to hold the comparison result that drives the
                 * conditional subtraction of the modulus. */
                mp = sp_4096_cmp_162(res, mod);
                sp_4096_cond_sub_162(res, res, mod, ~(mp >> 31));
            }
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_162(res, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (buf != NULL)
        XFREE(buf, NULL, DYNAMIC_TYPE_RSA);
#endif

    return err;
}

#ifndef WOLFSSL_RSA_PUBLIC_ONLY
#if !defined(SP_RSA_PRIVATE_EXP_D) && !defined(RSA_LOW_MEM)
#endif /* !SP_RSA_PRIVATE_EXP_D & !RSA_LOW_MEM */
/* RSA private key operation.
 *
 * Built in one of two ways:
 *  - SP_RSA_PRIVATE_EXP_D or RSA_LOW_MEM defined: computes in ^ dm mod mm
 *    directly; pm, qm, dpm, dqm and qim are ignored.
 *  - otherwise: uses the CRT parameters pm, qm, dpm, dqm and qim; dm is
 *    ignored and mm is only validated.
 *
 * All temporary storage is zeroized before it is released because it holds
 * copies of private key material and the result.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. At most 512.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus. Must be exactly 4096 bits and odd.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 512 bytes long.
 * outLen  On entry the size of out in bytes; set to 512 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 4096 bits, MP_VAL when the
 * modulus (or, in the CRT build, a prime) is even and MEMORY_E when dynamic
 * memory allocation fails.
 */
int sp_RsaPrivate_4096(const byte* in, word32 inLen, const mp_int* dm,
    const mp_int* pm, const mp_int* qm, const mp_int* dpm, const mp_int* dqm,
    const mp_int* qim, const mp_int* mm, byte* out, word32* outLen)
{
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(RSA_LOW_MEM)
    /* Direct exponentiation with the private exponent. The code is the same
     * whether or not WOLFSSL_SP_SMALL is defined. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* d = NULL;
#else
    sp_digit d[162 * 4];
#endif
    sp_digit* a = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)pm;
    (void)qm;
    (void)dpm;
    (void)dqm;
    (void)qim;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (mp_count_bits(dm) > 4096) {
            err = MP_READ_E;
        }
        else if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (d == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: exponent d (162), base/result a (2 * 162),
         * modulus m (162). The result overwrites the base. */
        a = d + 162;
        m = a + 324;
        r = a;

        sp_4096_from_bin(a, 162, in, inLen);
        sp_4096_from_mp(d, 162, dm);
        sp_4096_from_mp(m, 162, mm);
        err = sp_4096_mod_exp_162(r, a, d, 4096, m, 0);
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_162(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (d != NULL)
#endif
    {
        /* The buffer holds a copy of the private exponent as well as the
         * input/result, so clear all of it rather than just "a". */
        ForceZero(d, sizeof(sp_digit) * 162 * 4);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(d, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
#if defined(WOLFSSL_SP_SMALL)
    /* CRT, small-memory layout: one 81-digit slot is shared by p and q and
     * another by dp, dq and qi, so values are reloaded as needed. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[81 * 8];
#endif
    sp_digit* p = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 81 * 8, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif
    if (err == MP_OKAY) {
        /* Buffer layout: a/r (162), p-or-q (81), dp/dq/qi (81),
         * tmpa (162), tmpb (162). */
        p = a + 162;
        qi = dq = dp = p + 81;
        tmpa = qi + 81;
        tmpb = tmpa + 162;
        r = a;

        /* tmpa = in ^ dp mod p */
        sp_4096_from_bin(a, 162, in, inLen);
        sp_4096_from_mp(p, 81, pm);
        sp_4096_from_mp(dp, 81, dpm);
        err = sp_4096_mod_exp_81(tmpa, a, dp, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q (q loaded into the shared prime slot) */
        sp_4096_from_mp(p, 81, qm);
        sp_4096_from_mp(dq, 81, dqm);
        err = sp_4096_mod_exp_81(tmpb, a, dq, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb; p is then conditionally added twice, each time
         * controlled by bit 31 of tmpa[78]. */
        sp_4096_from_mp(p, 81, pm);
        (void)sp_4096_sub_81(tmpa, tmpa, tmpb);
        sp_4096_norm_79(tmpa);
        sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31));
        sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31));
        sp_4096_norm_81(tmpa);

        /* tmpa = tmpa * qi mod p */
        sp_4096_from_mp(qi, 81, qim);
        sp_4096_mul_81(tmpa, tmpa, qi);
        err = sp_4096_mod_81(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_4096_from_mp(p, 81, qm);
        sp_4096_mul_81(tmpa, p, tmpa);
        (void)sp_4096_add_162(r, tmpb, tmpa);
        sp_4096_norm_162(r);

        sp_4096_to_bin_162(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Everything in the buffer is derived from private key material. */
        ForceZero(a, sizeof(sp_digit) * 81 * 8);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#else
    /* CRT with a dedicated slot for every parameter. */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* a = NULL;
#else
    sp_digit a[81 * 13];
#endif
    sp_digit* p = NULL;
    sp_digit* q = NULL;
    sp_digit* dp = NULL;
    sp_digit* dq = NULL;
    sp_digit* qi = NULL;
    sp_digit* tmpa = NULL;
    sp_digit* tmpb = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;

    (void)dm;

    if (*outLen < 512U) {
        err = MP_TO_E;
    }
    if (err == MP_OKAY) {
        if (inLen > 512U) {
            err = MP_READ_E;
        }
        else if (mp_count_bits(mm) != 4096) {
            err = MP_READ_E;
        }
        else if (mp_iseven(mm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(pm)) {
            err = MP_VAL;
        }
        else if (mp_iseven(qm)) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        a = (sp_digit*)XMALLOC(sizeof(sp_digit) * 81 * 13, NULL,
                                                              DYNAMIC_TYPE_RSA);
        if (a == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: a/r (2 * 162), p, q, dp, dq, qi (81 each),
         * tmpa (162), tmpb (162). */
        p = a + 162 * 2;
        q = p + 81;
        dp = q + 81;
        dq = dp + 81;
        qi = dq + 81;
        tmpa = qi + 81;
        tmpb = tmpa + 162;
        r = a;

        sp_4096_from_bin(a, 162, in, inLen);
        sp_4096_from_mp(p, 81, pm);
        sp_4096_from_mp(q, 81, qm);
        sp_4096_from_mp(dp, 81, dpm);
        sp_4096_from_mp(dq, 81, dqm);
        sp_4096_from_mp(qi, 81, qim);

        /* tmpa = in ^ dp mod p */
        err = sp_4096_mod_exp_81(tmpa, a, dp, 2048, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = in ^ dq mod q */
        err = sp_4096_mod_exp_81(tmpb, a, dq, 2048, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb; p is then conditionally added twice, each time
         * controlled by bit 31 of tmpa[78]. */
        (void)sp_4096_sub_81(tmpa, tmpa, tmpb);
        sp_4096_norm_79(tmpa);
        sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31));
        sp_4096_cond_add_81(tmpa, tmpa, p, 0 - ((sp_int_digit)tmpa[78] >> 31));
        sp_4096_norm_81(tmpa);
        /* tmpa = tmpa * qi mod p */
        sp_4096_mul_81(tmpa, tmpa, qi);
        err = sp_4096_mod_81(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
        sp_4096_mul_81(tmpa, tmpa, q);
        (void)sp_4096_add_162(r, tmpb, tmpa);
        sp_4096_norm_162(r);

        sp_4096_to_bin_162(r, out);
        *outLen = 512;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (a != NULL)
#endif
    {
        /* Everything in the buffer is derived from private key material. */
        ForceZero(a, sizeof(sp_digit) * 81 * 13);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(a, NULL, DYNAMIC_TYPE_RSA);
#endif
    }

    return err;
#endif /* WOLFSSL_SP_SMALL */
#endif /* SP_RSA_PRIVATE_EXP_D || RSA_LOW_MEM */
}

#endif /* !WOLFSSL_RSA_PUBLIC_ONLY */
#endif /* WOLFSSL_HAVE_SP_RSA */
#if defined(WOLFSSL_HAVE_SP_DH) || (defined(WOLFSSL_HAVE_SP_RSA) && \
                                              !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Repack a number held as an array of sp_digit into an mp_int.
 *
 * 158 words of a are read, each contributing 26 bits, and written out as
 * DIGIT_BIT-sized digits of r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer. Grown to hold 4096 bits.
 * returns MP_OKAY on success, otherwise the error reported by mp_grow().
 */
static int sp_4096_to_mp(const sp_digit* a, mp_int* r)
{
    int err = mp_grow(r, (4096 + DIGIT_BIT - 1) / DIGIT_BIT);

    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 26
        /* Digit widths match, so the words are copied across unchanged. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 158);
#elif DIGIT_BIT < 26
        /* Each source word spans more than one destination digit. */
        int src;
        int dst = 0;
        int shift = 0;

        r->dp[0] = 0;
        for (src = 0; src < 158; src++) {
            /* Top up the partially filled digit and trim it to size. */
            r->dp[dst] |= (mp_digit)(a[src] << shift);
            r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            shift = DIGIT_BIT - shift;
            dst++;
            r->dp[dst] = (mp_digit)(a[src] >> shift);
            /* Emit further whole digits while the word still covers them. */
            while (shift + DIGIT_BIT <= 26) {
                shift += DIGIT_BIT;
                r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                dst++;
                if (shift != SP_WORD_SIZE) {
                    r->dp[dst] = (mp_digit)(a[src] >> shift);
                }
                else {
                    /* A shift by the full word size is not performed. */
                    r->dp[dst] = 0;
                }
            }
            shift = 26 - shift;
        }
#else
        /* Several source words may be packed into one destination digit. */
        /* NOTE(review): when DIGIT_BIT divides 4096 (e.g. 32 or 64) the last
         * iteration stores to r->dp[(4096 + DIGIT_BIT - 1) / DIGIT_BIT], one
         * past the count passed to mp_grow() - confirm mp_grow() leaves a
         * spare digit. */
        int src;
        int dst = 0;
        int shift = 0;

        r->dp[0] = 0;
        for (src = 0; src < 158; src++) {
            r->dp[dst] |= ((mp_digit)a[src]) << shift;
            if (shift + 26 < DIGIT_BIT) {
                /* The current digit still has room after this word. */
                shift += 26;
            }
            else {
                /* Digit is full: spill the remaining high bits of the word
                 * into the next one. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[dst] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                shift = DIGIT_BIT - shift;
                dst++;
                r->dp[dst] = a[src] >> shift;
                shift = 26 - shift;
            }
        }
#endif
        /* For DIGIT_BIT == 26 this evaluates to 158. */
        r->used = (4096 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * Computes res = base ^ exp mod mod. The same code is used whether or not
 * WOLFSSL_SP_SMALL is defined.
 *
 * base  Base. MP integer. At most 4096 bits.
 * exp   Exponent. MP integer. At most 4096 bits.
 * mod   Modulus. MP integer. Must be exactly 4096 bits and odd.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if the base or exponent is longer than
 * 4096 bits or the modulus is not exactly 4096 bits, MP_VAL if the modulus is
 * even, MEMORY_E if memory allocation fails, or an error returned by
 * sp_4096_mod_exp_162() or sp_4096_to_mp().
 */
int sp_ModExp_4096(const mp_int* base, const mp_int* exp, const mp_int* mod,
    mp_int* res)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[162 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    int err = MP_OKAY;
    /* Counted once and reused for both validation and exponentiation. */
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) > 4096) {
        err = MP_READ_E;
    }
    else if (expBits > 4096) {
        err = MP_READ_E;
    }
    else if (mp_count_bits(mod) != 4096) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Buffer layout: base/result b (2 * 162), exponent e (162),
         * modulus m (162). The result overwrites the base. */
        e = b + 162 * 2;
        m = e + 162;
        r = b;

        sp_4096_from_mp(b, 162, base);
        sp_4096_from_mp(e, 162, exp);
        sp_4096_from_mp(m, 162, mod);

        err = sp_4096_mod_exp_162(r, b, e, expBits, m, 0);
    }

    if (err == MP_OKAY) {
        err = sp_4096_to_mp(r, res);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL)
#endif
    {
        /* only "e" is sensitive and needs zeroized */
        if (e != NULL)
            ForceZero(e, sizeof(sp_digit) * 162U);
#ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
#endif
    }

    return err;
}

#ifdef WOLFSSL_HAVE_SP_DH

#ifdef HAVE_FFDHE_4096
/* Shift the 162 digit number a left by n bits and put result in r.
 * (r = a << n)
 *
 * Each digit holds 26 bits. The bits shifted out of a[161] are stored in
 * r[162], so r must have room for 163 digits.
 * Digits are processed from the most significant down and each source digit
 * is read before the result digit at the same index is written, so r and a
 * may be the same buffer.
 *
 * r  A single precision integer that receives the 163 digit result.
 * a  A single precision integer of 162 digits.
 * n  Number of bits to shift by. Must not be more than 26 as the code shifts
 *    right by (26 - n).
 */
SP_NOINLINE static void sp_4096_lshift_162(sp_digit* r, const sp_digit* a,
        byte n)
{
    sp_int_digit hi;
    sp_int_digit lo;
    int i;

    hi = (sp_int_digit)a[161];
    r[162] = (sp_digit)(hi >> (26U - n));
    for (i = 161; i > 0; i--) {
        lo = (sp_int_digit)a[i - 1];
        r[i] = (sp_digit)(((hi << n) | (lo >> (26U - n))) & 0x3ffffff);
        hi = lo;
    }
    /* Shift the lowest digit as an unsigned value too: a 26-bit digit moved
     * left by n bits need not fit in a signed sp_digit before it is masked.
     */
    r[0] = (sp_digit)((hi << n) & 0x3ffffff);
}

/* Modular exponentiate 2 to the e mod m. (r = 2^e mod m)
 *
 * The exponent is consumed in 4-bit windows starting at the most significant
 * end. For each window the result is squared four times and then multiplied
 * by 2^y, where y is the window value, using a left shift.
 *
 * r     A single precision number that is the result of the operation.
 * e     A single precision number that is the exponent.
 * bits  The number of bits in the exponent.
 * m     A single precision number that is the modulus.
 * returns  0 on success.
 * returns  MEMORY_E on dynamic memory allocation failure.
 * returns  MP_VAL when bits is zero.
 */
static int sp_4096_mod_exp_2_162(sp_digit* r, const sp_digit* e, int bits, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* td = NULL;
#else
    sp_digit td[487];
#endif
    sp_digit* norm = NULL;
    sp_digit* tmp = NULL;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit o;
    int i;
    int c;
    byte y;
    int err = MP_OKAY;

    if (bits == 0) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 487, NULL,
                                DYNAMIC_TYPE_TMP_BUFFER);
        if (td == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* td is split in two: norm is the first 324 digits and tmp is the
         * remaining 163 digits. */
        norm = td;
        tmp  = td + 324;
        XMEMSET(td, 0, sizeof(sp_digit) * 487);

        sp_4096_mont_setup(m, &mp);
        sp_4096_mont_norm_162(norm, m);

        /* Round the bit count up to a whole number of 4-bit windows. */
        bits = ((bits + 3) / 4) * 4;
        /* i is the index of the top exponent digit in use and c is the number
         * of bits to take from that digit. */
        i = ((bits + 25) / 26) - 1;
        c = bits % 26;
        if (c == 0) {
            c = 26;
        }
        /* n holds exponent bits left-aligned in 32 bits so that the next
         * window is always its top 4 bits. */
        if (i < 162) {
            n = e[i--] << (32 - c);
        }
        else {
            n = 0;
            i--;
        }
        if (c < 4) {
            /* Fewer than 4 bits loaded: place the next 26-bit digit directly
             * below them. */
            n |= e[i--] << (6 - c);
            c += 26;
        }
        y = (int)((n >> 28) & 0xf);
        n <<= 4;
        c -= 4;
        /* First window: r = norm << y.
         * NOTE(review): norm is presumably the Montgomery form of 1, making
         * this the Montgomery form of 2^y - confirm against
         * sp_4096_mont_norm_162. */
        sp_4096_lshift_162(r, norm, (byte)y);
        while ((i >= 0) || (c >= 4)) {
            if (c >= 4) {
                /* A whole window is still available in n. */
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c -= 4;
            }
            else if (c == 0) {
                /* n is used up: load the next digit and take its top 4 bits,
                 * leaving 22 bits. */
                n = e[i--] << 6;
                y = (byte)((n >> 28) & 0xf);
                n <<= 4;
                c = 22;
            }
            else {
                /* Window straddles two digits: the c bits left in n form the
                 * top of the window and the top (4 - c) bits of the next
                 * digit fill the rest. */
                y = (byte)((n >> 28) & 0xf);
                n = e[i--] << 6;
                c = 4 - c;
                y |= (byte)((n >> (32 - c)) & ((1 << c) - 1));
                n <<= c;
                c = 26 - c;
            }

            /* Four squarings move the result up by one 4-bit window. */
            sp_4096_mont_sqr_162(r, r, m, mp);
            sp_4096_mont_sqr_162(r, r, m, mp);
            sp_4096_mont_sqr_162(r, r, m, mp);
            sp_4096_mont_sqr_162(r, r, m, mp);

            /* Multiply by 2^y with a shift. */
            sp_4096_lshift_162(r, r, (byte)y);
            /* Bits at and above bit 4096 (bit 14 of digit 157 and up) are
             * removed from r and added back in as a multiple of norm.
             * NOTE(review): presumably norm is congruent to 2^4096 mod m so
             * that this keeps the value the same mod m - confirm. */
            sp_4096_mul_d_162(tmp, norm, (r[158] << 12) + (r[157] >> 14));
            r[158] = 0;
            r[157] &= 0x3fffL;
            (void)sp_4096_add_162(r, r, tmp);
            sp_4096_norm_162(r);
            /* Mask is all ones when the compare result is not negative.
             * NOTE(review): presumably a negative result means r < m, so m
             * is subtracted only when r >= m - confirm. */
            o = sp_4096_cmp_162(r, m);
            sp_4096_cond_sub_162(r, r, m, ~(o >> 31));
        }

        /* Montgomery reduce and apply one last conditional subtraction of
         * m. */
        sp_4096_mont_reduce_162(r, m, mp);
        n = sp_4096_cmp_162(r, m);
        sp_4096_cond_sub_162(r, r, m, ~(n >> 31));
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#endif /* HAVE_FFDHE_4096 */

/* Modular exponentiation for Diffie-Hellman with a 4096-bit modulus.
 *
 * The result is written to out as big-endian bytes with any leading zero
 * bytes removed.
 *
 * base     Base. Must be no more than 4096 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. Must be no more than 512.
 * mod      Modulus. Must be exactly 4096 bits and odd.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 512 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if base, exponent or modulus is not an
 * accepted size, MP_VAL if the modulus is even and MEMORY_E if memory
 * allocation fails.
 */
int sp_DhExp_4096(const mp_int* base, const byte* exp, word32 expLen,
    const mp_int* mod, byte* out, word32* outLen)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* b = NULL;
#else
    sp_digit b[162 * 4];
#endif
    sp_digit* e = NULL;
    sp_digit* m = NULL;
    sp_digit* r = NULL;
    word32 lead;
    int err = MP_OKAY;

    /* Size checks are made in order: base, exponent, modulus. */
    if ((mp_count_bits(base) > 4096) || (expLen > 512U) ||
            (mp_count_bits(mod) != 4096)) {
        err = MP_READ_E;
    }
    else if (mp_iseven(mod)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        b = (sp_digit*)XMALLOC(sizeof(sp_digit) * 162 * 4, NULL,
            DYNAMIC_TYPE_DH);
        if (b == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* One buffer holds everything: base and result share the first
         * 2 * 162 digits, followed by the exponent and then the modulus. */
        r = b;
        e = b + 162 * 2;
        m = e + 162;

        sp_4096_from_mp(b, 162, base);
        sp_4096_from_bin(e, 162, exp, expLen);
        sp_4096_from_mp(m, 162, mod);

    #ifdef HAVE_FFDHE_4096
        /* Base of 2 and a modulus whose top 16 bits are all set: use the
         * dedicated base 2 exponentiation. */
        if ((base->used == 1) && (base->dp[0] == 2U) &&
                (((m[157] << 2) | (m[156] >> 24)) == 0xffffL)) {
            err = sp_4096_mod_exp_2_162(r, e, expLen * 8U, m);
        }
        else
    #endif
        {
            err = sp_4096_mod_exp_162(r, b, e, expLen * 8U, m, 0);
        }
    }

    if (err == MP_OKAY) {
        sp_4096_to_bin_162(r, out);
        *outLen = 512;
        /* Count the leading zero bytes and move the rest to the front. */
        lead = 0;
        while ((lead < 512U) && (out[lead] == 0U)) {
            lead++;
        }
        *outLen -= lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* only "e" is sensitive and needs zeroized */
    if (e != NULL) {
        ForceZero(e, sizeof(sp_digit) * 162U);
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (b != NULL) {
        XFREE(b, NULL, DYNAMIC_TYPE_DH);
    }
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_HAVE_SP_DH | (WOLFSSL_HAVE_SP_RSA & !WOLFSSL_RSA_PUBLIC_ONLY) */

#endif /* WOLFSSL_SP_SMALL */
#endif /* WOLFSSL_SP_4096 */

#endif /* WOLFSSL_HAVE_SP_RSA | WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_ECC
#ifndef WOLFSSL_SP_NO_256

/* Point structure to use.
 *
 * Each ordinate has room for 2 * 9 digits.
 */
typedef struct sp_point_256 {
    /* X ordinate of point. */
    sp_digit x[2 * 9];
    /* Y ordinate of point. */
    sp_digit y[2 * 9];
    /* Z ordinate of point. */
    sp_digit z[2 * 9];
    /* Indicates point is at infinity. */
    int infinity;
} sp_point_256;

/* The modulus (prime) of the curve P256.
 * Stored, like the other P256 numbers below, as 29-bit digits with the least
 * significant digit first. */
static const sp_digit p256_mod[9] = {
    0x1fffffff,0x1fffffff,0x1fffffff,0x000001ff,0x00000000,0x00000000,
    0x00040000,0x1fe00000,0x00ffffff
};
/* The Montgomery normalizer for modulus of the curve P256. */
static const sp_digit p256_norm_mod[9] = {
    0x00000001,0x00000000,0x00000000,0x1ffffe00,0x1fffffff,0x1fffffff,
    0x1ffbffff,0x001fffff,0x00000000
};
/* The Montgomery multiplier for modulus of the curve P256. */
static const sp_digit p256_mp_mod = 0x0000001;
#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
                                            defined(HAVE_ECC_VERIFY)
/* The order of the curve P256. */
static const sp_digit p256_order[9] = {
    0x1c632551,0x1dce5617,0x05e7a13c,0x0df55b4e,0x1ffffbce,0x1fffffff,
    0x0003ffff,0x1fe00000,0x00ffffff
};
#endif
/* The order of the curve P256 minus 2. */
static const sp_digit p256_order2[9] = {
    0x1c63254f,0x1dce5617,0x05e7a13c,0x0df55b4e,0x1ffffbce,0x1fffffff,
    0x0003ffff,0x1fe00000,0x00ffffff
};
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery normalizer for order of the curve P256. */
static const sp_digit p256_norm_order[9] = {
    0x039cdaaf,0x0231a9e8,0x1a185ec3,0x120aa4b1,0x00000431,0x00000000,
    0x1ffc0000,0x001fffff,0x00000000
};
#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery multiplier for order of the curve P256. */
static const sp_digit p256_mp_order = 0xe00bc4f;
#endif
/* The base point of curve P256. */
static const sp_point_256 p256_base = {
    /* X ordinate */
    {
        0x1898c296,0x0509ca2e,0x1acce83d,0x06fb025b,0x040f2770,0x1372b1d2,
        0x091fe2f3,0x1e5c2588,0x006b17d1,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Y ordinate */
    {
        0x17bf51f5,0x1db20341,0x0c57b3b2,0x1c66aed6,0x19e162bc,0x15a53e07,
        0x1e6e3b9f,0x1c5fc34f,0x004fe342,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Z ordinate */
    {
        0x00000001,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,
        0x00000000,0x00000000,0x00000000,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* infinity */
    0
};
#if defined(HAVE_ECC_CHECK_KEY) || defined(HAVE_COMP_KEY)
/* The b coefficient of the curve P256 (the digits match the NIST P-256
 * value of b). */
static const sp_digit p256_b[9] = {
    0x07d2604b,0x1e71e1f1,0x14ec3d8e,0x1a0d6198,0x086bc651,0x1eaabb4c,
    0x0f9ecfae,0x1b154752,0x005ac635
};
#endif

#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Product columns are summed from the most significant down to the least
 * significant. Each column sum is split at 29 bits: the high part completes
 * the digit above and the low part is held over to start the next digit.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_256_mul_9(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_uint64 carry;
    sp_uint64 sum;
    int col;
    int j;
    int last;

    /* The top column has the single product a[8] * b[8]. */
    carry = ((sp_uint64)a[8]) * b[8];
    r[17] = (sp_digit)(carry >> 29);
    carry &= 0x1fffffff;
    for (col = 15; col >= 0; col--) {
        /* Keep both a[j] and b[col - j] within digits 0 to 8. */
        j = (col > 8) ? (col - 8) : 0;
        last = (col < 8) ? col : 8;

        sum = 0;
        while (j <= last) {
            sum += ((sp_uint64)a[j]) * b[col - j];
            j++;
        }

        carry += sum >> 29;
        r[col + 2] += (sp_digit)(carry >> 29);
        r[col + 1]  = (sp_digit)(carry & 0x1fffffff);
        carry = sum & 0x1fffffff;
    }
    r[0] = (sp_digit)carry;
}

#else
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled multiplication of two 9 digit numbers, with 29 bits per
 * digit, giving an 18 digit result.
 * t0 and t1 take turns as the accumulator: while one holds the finished sum
 * of a column, the other collects the products of the next column. The low
 * 29 bits of a finished sum become the result digit and the remaining bits
 * are carried into the next column.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_256_mul_9(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_int64 t0;
    sp_int64 t1;
    /* Holds the low 9 result digits until all products are computed.
     * NOTE(review): presumably this allows r to share storage with a or b -
     * confirm against callers. */
    sp_digit t[9];

    t0 = ((sp_int64)a[ 0]) * b[ 0];
    t1 = ((sp_int64)a[ 0]) * b[ 1]
       + ((sp_int64)a[ 1]) * b[ 0];
    t[ 0] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 0]) * b[ 2]
       + ((sp_int64)a[ 1]) * b[ 1]
       + ((sp_int64)a[ 2]) * b[ 0];
    t[ 1] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 0]) * b[ 3]
       + ((sp_int64)a[ 1]) * b[ 2]
       + ((sp_int64)a[ 2]) * b[ 1]
       + ((sp_int64)a[ 3]) * b[ 0];
    t[ 2] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 0]) * b[ 4]
       + ((sp_int64)a[ 1]) * b[ 3]
       + ((sp_int64)a[ 2]) * b[ 2]
       + ((sp_int64)a[ 3]) * b[ 1]
       + ((sp_int64)a[ 4]) * b[ 0];
    t[ 3] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 0]) * b[ 5]
       + ((sp_int64)a[ 1]) * b[ 4]
       + ((sp_int64)a[ 2]) * b[ 3]
       + ((sp_int64)a[ 3]) * b[ 2]
       + ((sp_int64)a[ 4]) * b[ 1]
       + ((sp_int64)a[ 5]) * b[ 0];
    t[ 4] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 0]) * b[ 6]
       + ((sp_int64)a[ 1]) * b[ 5]
       + ((sp_int64)a[ 2]) * b[ 4]
       + ((sp_int64)a[ 3]) * b[ 3]
       + ((sp_int64)a[ 4]) * b[ 2]
       + ((sp_int64)a[ 5]) * b[ 1]
       + ((sp_int64)a[ 6]) * b[ 0];
    t[ 5] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 0]) * b[ 7]
       + ((sp_int64)a[ 1]) * b[ 6]
       + ((sp_int64)a[ 2]) * b[ 5]
       + ((sp_int64)a[ 3]) * b[ 4]
       + ((sp_int64)a[ 4]) * b[ 3]
       + ((sp_int64)a[ 5]) * b[ 2]
       + ((sp_int64)a[ 6]) * b[ 1]
       + ((sp_int64)a[ 7]) * b[ 0];
    t[ 6] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 0]) * b[ 8]
       + ((sp_int64)a[ 1]) * b[ 7]
       + ((sp_int64)a[ 2]) * b[ 6]
       + ((sp_int64)a[ 3]) * b[ 5]
       + ((sp_int64)a[ 4]) * b[ 4]
       + ((sp_int64)a[ 5]) * b[ 3]
       + ((sp_int64)a[ 6]) * b[ 2]
       + ((sp_int64)a[ 7]) * b[ 1]
       + ((sp_int64)a[ 8]) * b[ 0];
    t[ 7] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 1]) * b[ 8]
       + ((sp_int64)a[ 2]) * b[ 7]
       + ((sp_int64)a[ 3]) * b[ 6]
       + ((sp_int64)a[ 4]) * b[ 5]
       + ((sp_int64)a[ 5]) * b[ 4]
       + ((sp_int64)a[ 6]) * b[ 3]
       + ((sp_int64)a[ 7]) * b[ 2]
       + ((sp_int64)a[ 8]) * b[ 1];
    t[ 8] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 2]) * b[ 8]
       + ((sp_int64)a[ 3]) * b[ 7]
       + ((sp_int64)a[ 4]) * b[ 6]
       + ((sp_int64)a[ 5]) * b[ 5]
       + ((sp_int64)a[ 6]) * b[ 4]
       + ((sp_int64)a[ 7]) * b[ 3]
       + ((sp_int64)a[ 8]) * b[ 2];
    /* From digit 9 upwards the result is written straight into r. */
    r[ 9] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 3]) * b[ 8]
       + ((sp_int64)a[ 4]) * b[ 7]
       + ((sp_int64)a[ 5]) * b[ 6]
       + ((sp_int64)a[ 6]) * b[ 5]
       + ((sp_int64)a[ 7]) * b[ 4]
       + ((sp_int64)a[ 8]) * b[ 3];
    r[10] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 4]) * b[ 8]
       + ((sp_int64)a[ 5]) * b[ 7]
       + ((sp_int64)a[ 6]) * b[ 6]
       + ((sp_int64)a[ 7]) * b[ 5]
       + ((sp_int64)a[ 8]) * b[ 4];
    r[11] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 5]) * b[ 8]
       + ((sp_int64)a[ 6]) * b[ 7]
       + ((sp_int64)a[ 7]) * b[ 6]
       + ((sp_int64)a[ 8]) * b[ 5];
    r[12] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 6]) * b[ 8]
       + ((sp_int64)a[ 7]) * b[ 7]
       + ((sp_int64)a[ 8]) * b[ 6];
    r[13] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = ((sp_int64)a[ 7]) * b[ 8]
       + ((sp_int64)a[ 8]) * b[ 7];
    r[14] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = ((sp_int64)a[ 8]) * b[ 8];
    r[15] = t1 & 0x1fffffff; t0 += t1 >> 29;
    r[16] = t0 & 0x1fffffff;
    /* Whatever is left above 29 bits is the top digit. */
    r[17] = (sp_digit)(t0 >> 29);
    /* Copy the staged low 9 digits into place now that a and b are no
     * longer read. */
    XMEMCPY(r, t, sizeof(t));
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Compute the square of a 9-digit number. (r = a * a)
 *
 * Result columns are produced from the most significant one downwards. For
 * each column the off-diagonal products are summed once and then doubled;
 * the diagonal product is added (undoubled) for even numbered columns.
 *
 * r  A single precision integer. Receives 18 digits.
 * a  A single precision integer. 9 digits are read.
 */
SP_NOINLINE static void sp_256_sqr_9(sp_digit* r, const sp_digit* a)
{
    int col;
    int lo;
    int hi;
    sp_uint64 acc;
    sp_uint64 cross;

    /* The top column consists of a[8] * a[8] only. */
    acc = ((sp_uint64)a[8]) * a[8];
    r[17] = (sp_digit)(acc >> 29);
    /* Keep the low 29 bits, positioned above the next column's low bits. */
    acc = (acc & 0x1fffffff) << 29;
    for (col = 15; col >= 0; col--) {
        /* First index of the pair (lo, col - lo) with lo >= col - lo. */
        lo = (col + 1) / 2;
        if ((col & 1) == 0) {
            /* Diagonal term of an even column. */
            acc += ((sp_uint64)a[lo]) * a[lo];
            lo++;
        }
        hi = (col < 8) ? col : 8;
        cross = 0;
        while (lo <= hi) {
            cross += ((sp_uint64)a[lo]) * a[col - lo];
            lo++;
        }
        acc += cross * 2;

        /* Bits 58 and up carry two digits higher, bits 29..57 complete the
         * digit above this column. */
        r[col + 2] += (sp_digit) (acc >> 58);
        r[col + 1]  = (sp_digit)((acc >> 29) & 0x1fffffff);
        acc = (acc & 0x1fffffff) << 29;
    }
    r[0] = (sp_digit)(acc >> 29);
}

#else
/* Square a and put result in r. (r = a * a)
 *
 * Fully unrolled schoolbook squaring of a 9-digit number with 29-bit digits.
 * Result column k sums the products a[i] * a[k - i]: off-diagonal products
 * are added once and doubled, the diagonal product is added once for even k.
 *
 * The low nine result digits are collected in the local array t and copied
 * into r only at the end, so a[0..8] is still intact while the columns are
 * computed even when r and a are the same buffer (sp_256_mont_sqr_n_9 squares
 * in place).
 *
 * r  A single precision integer. Receives 18 digits.
 * a  A single precision integer. 9 digits are read.
 */
SP_NOINLINE static void sp_256_sqr_9(sp_digit* r, const sp_digit* a)
{
    /* t0 and t1 alternate: one holds the column being finished, the other
     * the next column, which then receives the carry (>> 29). */
    sp_int64 t0;
    sp_int64 t1;
    sp_digit t[9];

    t0 =  ((sp_int64)a[ 0]) * a[ 0];
    t1 = (((sp_int64)a[ 0]) * a[ 1]) * 2;
    t[ 0] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 0]) * a[ 2]) * 2
       +  ((sp_int64)a[ 1]) * a[ 1];
    t[ 1] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 0]) * a[ 3]
       +  ((sp_int64)a[ 1]) * a[ 2]) * 2;
    t[ 2] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 0]) * a[ 4]
       +  ((sp_int64)a[ 1]) * a[ 3]) * 2
       +  ((sp_int64)a[ 2]) * a[ 2];
    t[ 3] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 0]) * a[ 5]
       +  ((sp_int64)a[ 1]) * a[ 4]
       +  ((sp_int64)a[ 2]) * a[ 3]) * 2;
    t[ 4] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 0]) * a[ 6]
       +  ((sp_int64)a[ 1]) * a[ 5]
       +  ((sp_int64)a[ 2]) * a[ 4]) * 2
       +  ((sp_int64)a[ 3]) * a[ 3];
    t[ 5] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 0]) * a[ 7]
       +  ((sp_int64)a[ 1]) * a[ 6]
       +  ((sp_int64)a[ 2]) * a[ 5]
       +  ((sp_int64)a[ 3]) * a[ 4]) * 2;
    t[ 6] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 0]) * a[ 8]
       +  ((sp_int64)a[ 1]) * a[ 7]
       +  ((sp_int64)a[ 2]) * a[ 6]
       +  ((sp_int64)a[ 3]) * a[ 5]) * 2
       +  ((sp_int64)a[ 4]) * a[ 4];
    t[ 7] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 1]) * a[ 8]
       +  ((sp_int64)a[ 2]) * a[ 7]
       +  ((sp_int64)a[ 3]) * a[ 6]
       +  ((sp_int64)a[ 4]) * a[ 5]) * 2;
    t[ 8] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 2]) * a[ 8]
       +  ((sp_int64)a[ 3]) * a[ 7]
       +  ((sp_int64)a[ 4]) * a[ 6]) * 2
       +  ((sp_int64)a[ 5]) * a[ 5];
    /* Digits 9 and up are written directly to r. */
    r[ 9] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 3]) * a[ 8]
       +  ((sp_int64)a[ 4]) * a[ 7]
       +  ((sp_int64)a[ 5]) * a[ 6]) * 2;
    r[10] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 4]) * a[ 8]
       +  ((sp_int64)a[ 5]) * a[ 7]) * 2
       +  ((sp_int64)a[ 6]) * a[ 6];
    r[11] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 5]) * a[ 8]
       +  ((sp_int64)a[ 6]) * a[ 7]) * 2;
    r[12] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 = (((sp_int64)a[ 6]) * a[ 8]) * 2
       +  ((sp_int64)a[ 7]) * a[ 7];
    r[13] = t1 & 0x1fffffff; t0 += t1 >> 29;
    t1 = (((sp_int64)a[ 7]) * a[ 8]) * 2;
    r[14] = t0 & 0x1fffffff; t1 += t0 >> 29;
    t0 =  ((sp_int64)a[ 8]) * a[ 8];
    r[15] = t1 & 0x1fffffff; t0 += t1 >> 29;
    r[16] = t0 & 0x1fffffff;
    /* Whatever is left above 29 bits becomes the top digit, unmasked. */
    r[17] = (sp_digit)(t0 >> 29);
    /* Now that a is no longer needed, store the low nine digits. */
    XMEMCPY(r, t, sizeof(t));
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise addition of two 9-digit numbers. (r = a + b)
 *
 * No carry is moved between digits here; callers in this file follow up with
 * sp_256_norm_9 when they need 29-bit digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_256_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int limb = 0;

    while (limb < 9) {
        r[limb] = a[limb] + b[limb];
        limb++;
    }

    return 0;
}
#else
/* Add b to a into r. (r = a + b)
 *
 * Unrolled digit-wise addition of 9 digits. No carry is propagated between
 * digits; callers in this file follow up with sp_256_norm_9 when they need
 * 29-bit digits.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_256_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    r[ 0] = a[ 0] + b[ 0];
    r[ 1] = a[ 1] + b[ 1];
    r[ 2] = a[ 2] + b[ 2];
    r[ 3] = a[ 3] + b[ 3];
    r[ 4] = a[ 4] + b[ 4];
    r[ 5] = a[ 5] + b[ 5];
    r[ 6] = a[ 6] + b[ 6];
    r[ 7] = a[ 7] + b[ 7];
    r[ 8] = a[ 8] + b[ 8];

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Digit-wise subtraction of two 9-digit numbers. (r = a - b)
 *
 * No borrow is moved between digits here; callers in this file follow up
 * with sp_256_norm_9.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_256_sub_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int limb = 0;

    while (limb < 9) {
        r[limb] = a[limb] - b[limb];
        limb++;
    }

    return 0;
}


#else
/* Sub b from a into r. (r = a - b)
 *
 * Unrolled digit-wise subtraction of 9 digits. No borrow is propagated
 * between digits; callers in this file follow up with sp_256_norm_9.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_256_sub_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    r[ 0] = a[ 0] - b[ 0];
    r[ 1] = a[ 1] - b[ 1];
    r[ 2] = a[ 2] - b[ 2];
    r[ 3] = a[ 3] - b[ 3];
    r[ 4] = a[ 4] - b[ 4];
    r[ 5] = a[ 5] - b[ 5];
    r[ 6] = a[ 6] - b[ 6];
    r[ 7] = a[ 7] - b[ 7];
    r[ 8] = a[ 8] - b[ 8];

    return 0;
}

#endif /* WOLFSSL_SP_SMALL */
/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the DIGIT_BIT-wide digits of a into 29-bit digits of r. Digits of
 * r beyond the value held in a, up to size, are set to zero. Bits of a that
 * do not fit into size digits are dropped.
 *
 * r  A single precision integer.
 * size  Maximum number of digits of r to fill (an index count into r, not a
 *       byte count).
 * a  A multi-precision integer.
 */
static void sp_256_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 29
    /* Same digit width: branch-free copy. j counts up from -used and o stops
     * advancing through a->dp once j is no longer negative.
     * NOTE(review): if sp_digit is a signed 32-bit type, j >> 28 is -1 while
     * j is negative, which would make mask 1 (not all ones) and step o
     * backwards - confirm this branch against the sp_digit typedef before
     * building with DIGIT_BIT == 29. */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        sp_digit mask = (sp_digit)0 - (j >> 28);
        r[i] = a->dp[o] & mask;
        j++;
        o += (int)(j >> 28);
    }
#elif DIGIT_BIT > 29
    /* Source digits are wider than 29 bits: each one is split over two or
     * more result digits. s tracks the bit offset within the current digit. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Fill the rest of the current result digit. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x1fffffff;
        s = 29U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Emit further whole 29-bit digits while the source digit has them. */
        while ((s + 29U) <= (word32)DIGIT_BIT) {
            s += 29U;
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        /* Number of bits already placed in r[j]. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the digits that were not reached. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* Source digits are narrower than 29 bits: several are accumulated into
     * each result digit. s is the bit offset within r[j]. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 29) {
            /* Result digit is full: mask it and start the next one. */
            r[j] &= 0x1fffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 29 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Left-over high bits of this source digit. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the digits that were not reached. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Load an ecc_point into an sp_point_256.
 *
 * Each ordinate buffer of p is cleared in full and then filled with the
 * 9-digit conversion of the matching ordinate of pm. The infinity flag of p
 * is cleared.
 *
 * p   Point of type sp_point_256 (result).
 * pm  Point of type ecc_point.
 */
static void sp_256_point_from_ecc_point_9(sp_point_256* p,
        const ecc_point* pm)
{
    /* X ordinate. */
    XMEMSET(p->x, 0, sizeof(p->x));
    sp_256_from_mp(p->x, 9, pm->x);

    /* Y ordinate. */
    XMEMSET(p->y, 0, sizeof(p->y));
    sp_256_from_mp(p->y, 9, pm->y);

    /* Z ordinate. */
    XMEMSET(p->z, 0, sizeof(p->z));
    sp_256_from_mp(p->z, 9, pm->z);

    p->infinity = 0;
}

/* Convert an array of sp_digit to an mp_int.
 *
 * Grows r to hold 256 bits, repacks the nine 29-bit digits of a into
 * DIGIT_BIT-wide digits, sets r->used and clamps r.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the result of mp_grow: MP_OKAY on success; on failure r is not
 * written.
 */
static int sp_256_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (256 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 29
        /* Same digit width: straight copy of the nine digits. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 9);
        r->used = 9;
        mp_clamp(r);
#elif DIGIT_BIT < 29
        /* Destination digits are narrower: each source digit is spread over
         * two or more of them. s is the bit offset being tracked. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 9; i++) {
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 29) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 29 - s;
        }
        r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* Destination digits are wider: several source digits are packed
         * into each one. s is the bit offset within r->dp[j]. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 9; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 29 >= DIGIT_BIT) {
                /* Destination digit is full: the mask is only applied when
                 * DIGIT_BIT is not 32 or 64. */
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                /* Left-over high bits of a[i] start the next digit. */
                r->dp[++j] = a[i] >> s;
                s = 29 - s;
            }
            else {
                s += 29;
            }
        }
        r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Store an sp_point_256 into an ecc_point.
 *
 * The x, y and z ordinates are converted in that order with sp_256_to_mp.
 * Conversion stops at the first ordinate that fails.
 *
 * p   Point of type sp_point_256.
 * pm  Point of type ecc_point (result).
 * returns MEMORY_E when allocation of memory in ecc_point fails otherwise
 * MP_OKAY.
 */
static int sp_256_point_to_ecc_point_9(const sp_point_256* p, ecc_point* pm)
{
    int err = sp_256_to_mp(p->x, pm->x);

    if (err != MP_OKAY) {
        return err;
    }

    err = sp_256_to_mp(p->y, pm->y);
    if (err != MP_OKAY) {
        return err;
    }

    return sp_256_to_mp(p->z, pm->z);
}

/* Compare a with b in constant time.
 *
 * Digits are examined from the most significant (index 8) to the least
 * significant. While the result is still zero the difference a[i] - b[i] is
 * OR-ed into it in full; once it is non-zero the mask derived from the result
 * is applied to the differences of the lower digits. There are no data
 * dependent branches.
 *
 * NOTE(review): the >> 28 mask looks like it relies on sp_digit being a
 * signed 32-bit type with arithmetic right shift and on both inputs being
 * normalized to 29-bit digits - confirm against the sp_digit typedef.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_256_cmp_9(const sp_digit* a, const sp_digit* b)
{
    sp_digit r = 0;
#ifdef WOLFSSL_SP_SMALL
    int i;

    for (i=8; i>=0; i--) {
        r |= (a[i] - b[i]) & ~(((sp_digit)0 - r) >> 28);
    }
#else
    /* Top digit: r is zero here so the mask is all ones. */
    r |= (a[ 8] - b[ 8]) & (0 - (sp_digit)1);
    r |= (a[ 7] - b[ 7]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 6] - b[ 6]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 5] - b[ 5]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 4] - b[ 4]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 3] - b[ 3]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 2] - b[ 2]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 1] - b[ 1]) & ~(((sp_digit)0 - r) >> 28);
    r |= (a[ 0] - b[ 0]) & ~(((sp_digit)0 - r) >> 28);
#endif /* WOLFSSL_SP_SMALL */

    return r;
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not.
 *
 * Each digit of b is AND-ed with m before it is subtracted, so the same
 * instructions run whether or not the subtraction takes effect. No borrow is
 * propagated between digits; callers in this file follow up with
 * sp_256_norm_9.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 */
static void sp_256_cond_sub_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int i;

    for (i = 0; i < 9; i++) {
        r[i] = a[i] - (b[i] & m);
    }
#else
    r[ 0] = a[ 0] - (b[ 0] & m);
    r[ 1] = a[ 1] - (b[ 1] & m);
    r[ 2] = a[ 2] - (b[ 2] & m);
    r[ 3] = a[ 3] - (b[ 3] & m);
    r[ 4] = a[ 4] - (b[ 4] & m);
    r[ 5] = a[ 5] - (b[ 5] & m);
    r[ 6] = a[ 6] - (b[ 6] & m);
    r[ 7] = a[ 7] - (b[ 7] & m);
    r[ 8] = a[ 8] - (b[ 8] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Digits r[0..8] are rewritten masked to 29 bits and the final carry is added
 * to r[9], so r must have at least 10 digits. Three equivalent code shapes
 * are selected at build time: a plain loop (default), and with
 * WOLFSSL_SP_LARGE_CODE a 4-way or 8-way unrolled form followed by a separate
 * step for digit 8.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_256_mul_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifndef WOLFSSL_SP_LARGE_CODE
    sp_int64 tb = b;
    sp_int64 t = 0;
    int i;

    /* t carries the running sum; its low 29 bits become the digit. */
    for (i = 0; i < 9; i++) {
        t += r[i];
        t += tb * a[i];
        r[i] = ((sp_digit)t) & 0x1fffffff;
        t >>= 29;
    }
    r[9] += (sp_digit)t;
#else
#ifdef WOLFSSL_SP_SMALL
    sp_int64 tb = b;
    sp_int64 t[4];
    int i;

    /* t[0] enters each group of four holding the carry of the previous one. */
    t[0] = 0;
    for (i = 0; i < 8; i += 4) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[0]  = t[3] >> 29;
    }
    /* Ninth digit and the carry out. */
    t[0] += (tb * a[8]) + r[8];
    r[8] = t[0] & 0x1fffffff;
    r[9] +=  (sp_digit)(t[0] >> 29);
#else
    sp_int64 tb = b;
    sp_int64 t[8];
    int i;

    /* The loop body runs once, handling digits 0..7 as one group of eight. */
    t[0] = 0;
    for (i = 0; i < 8; i += 8) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        t[4]  = (tb * a[i+4]) + r[i+4];
        t[5]  = (tb * a[i+5]) + r[i+5];
        t[6]  = (tb * a[i+6]) + r[i+6];
        t[7]  = (tb * a[i+7]) + r[i+7];
        r[i+0] = t[0] & 0x1fffffff;
        t[1] += t[0] >> 29;
        r[i+1] = t[1] & 0x1fffffff;
        t[2] += t[1] >> 29;
        r[i+2] = t[2] & 0x1fffffff;
        t[3] += t[2] >> 29;
        r[i+3] = t[3] & 0x1fffffff;
        t[4] += t[3] >> 29;
        r[i+4] = t[4] & 0x1fffffff;
        t[5] += t[4] >> 29;
        r[i+5] = t[5] & 0x1fffffff;
        t[6] += t[5] >> 29;
        r[i+6] = t[6] & 0x1fffffff;
        t[7] += t[6] >> 29;
        r[i+7] = t[7] & 0x1fffffff;
        t[0]  = t[7] >> 29;
    }
    /* Ninth digit and the carry out. */
    t[0] += (tb * a[8]) + r[8];
    r[8] = t[0] & 0x1fffffff;
    r[9] +=  (sp_digit)(t[0] >> 29);
#endif /* WOLFSSL_SP_SMALL */
#endif /* !WOLFSSL_SP_LARGE_CODE */
}

/* Bring digits 0..7 back into the 29-bit range.
 *
 * Working upwards, whatever a digit holds above bit 28 is added to the next
 * digit and then cleared from the current one. The top digit (index 8) only
 * receives; it is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_256_norm_9(sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int hi;

    for (hi = 1; hi < 9; hi++) {
        a[hi] += a[hi - 1] >> 29;
        a[hi - 1] &= 0x1fffffff;
    }
#else
    a[1] += a[0] >> 29;
    a[0] &= 0x1fffffff;
    a[2] += a[1] >> 29;
    a[1] &= 0x1fffffff;
    a[3] += a[2] >> 29;
    a[2] &= 0x1fffffff;
    a[4] += a[3] >> 29;
    a[3] &= 0x1fffffff;
    a[5] += a[4] >> 29;
    a[4] &= 0x1fffffff;
    a[6] += a[5] >> 29;
    a[5] &= 0x1fffffff;
    a[7] += a[6] >> 29;
    a[6] &= 0x1fffffff;
    a[8] += a[7] >> 29;
    a[7] &= 0x1fffffff;
#endif /* WOLFSSL_SP_SMALL */
}

/* Shift the result in the high 256 bits down to the bottom.
 *
 * Bit 256 of the 18-digit input lies at bit 24 of digit 8 (8 * 29 + 24), so
 * the value is rebuilt from a[8] >> 24 and each following digit shifted up by
 * 5 (29 - 24) bits. Digits 9..17 of r are zeroed afterwards. Every input
 * digit is read before the output digit at the same index is written, which
 * is what lets sp_256_mont_reduce_order_9 call this with r == a.
 *
 * r  A single precision number. 18 digits are written.
 * a  A single precision number. 18 digits are read.
 */
static void sp_256_mont_shift_9(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    sp_int64 n = a[8] >> 24;
    n += ((sp_int64)a[9]) << 5;

    for (i = 0; i < 8; i++) {
        r[i] = n & 0x1fffffff;
        n >>= 29;
        n += ((sp_int64)a[10 + i]) << 5;
    }
    /* The top digit keeps whatever is left, unmasked. */
    r[8] = (sp_digit)n;
#else
    sp_int64 n = a[8] >> 24;
    n += ((sp_int64)a[9]) << 5;
    r[ 0] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[10]) << 5;
    r[ 1] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[11]) << 5;
    r[ 2] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[12]) << 5;
    r[ 3] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[13]) << 5;
    r[ 4] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[14]) << 5;
    r[ 5] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[15]) << 5;
    r[ 6] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[16]) << 5;
    r[ 7] = n & 0x1fffffff; n >>= 29; n += ((sp_int64)a[17]) << 5;
    /* The top digit keeps whatever is left, unmasked. */
    r[8] = (sp_digit)n;
#endif /* WOLFSSL_SP_SMALL */
    XMEMSET(&r[9], 0, sizeof(*r) * 9U);
}

/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * Generic variant: unlike sp_256_mont_reduce_9 it uses the m and mp arguments
 * it is given. (The name suggests it is meant for the curve order - confirm
 * against the callers.)
 *
 * a   A single precision number to reduce in place. 18 digits are used.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_256_mont_reduce_order_9(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper half before multiples of m are added into it. */
    sp_256_norm_9(a + 9);

    /* For each of the low eight digits, add the multiple of m selected by
     * mu = a[i] * mp (low 29 bits), then push the carry up one digit. */
    for (i=0; i<8; i++) {
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x1fffffff;
        sp_256_mul_add_9(a+i, m, mu);
        a[i+1] += a[i] >> 29;
    }
    /* Digit 8 holds only the remaining 24 bits below bit 256 (8 * 29 + 24),
     * so mu is masked to 24 bits. */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xffffffL;
    sp_256_mul_add_9(a+i, m, mu);
    a[i+1] += a[i] >> 29;
    a[i] &= 0x1fffffff;
    /* Move bits 256 and up down to digit 0. */
    sp_256_mont_shift_9(a, a);
    /* Subtract m once when anything is left at or above bit 256: the mask is
     * all ones when over is positive and zero when over is zero. */
    over = a[8] >> 24;
    sp_256_cond_sub_9(a, a, m, ~((over - 1) >> 31));
    sp_256_norm_9(a);
}

/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * Variant specialised for one fixed modulus: m and mp are ignored and the
 * modulus is built into the shifts and constants below. For every low digit
 * the value am is added at bit offsets 96, 192 and 256 and subtracted at bit
 * offset 224 relative to that digit (3*29+9, 6*29+18, 8*29+24 and 7*29+21),
 * i.e. am * (2^256 - 2^224 + 2^192 + 2^96) is added. This presumably equals
 * am * (p256_mod + 1), with the "+ 1" part handled by the carry of a[i].
 *
 * a   A single precision number to reduce in place. 18 digits are used.
 * m   The single precision number representing the modulus. (Unused.)
 * mp  The digit representing the negative inverse of m mod 2^n. (Unused.)
 */
static void sp_256_mont_reduce_9(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit am;

    (void)m;
    (void)mp;

    for (i = 0; i < 8; i++) {
        am = a[i] & 0x1fffffff;
        /* + am << 96 */
        a[i + 3] += (am << 9) & 0x1fffffff;
        a[i + 4] += am >> 20;
        /* + am << 192 and - am << 224 */
        a[i + 6] += (am << 18) & 0x1fffffff;
        a[i + 7] += (am >> 11) - ((am << 21) & 0x1fffffff);
        /* + am << 256 */
        a[i + 8] += -(am >> 8) + ((am << 24) & 0x1fffffff);
        a[i + 9] += am >> 5;

        /* Carry of the digit just processed moves up. */
        a[i + 1] += a[i] >> 29;
    }
    /* Digit 8 contributes only its low 24 bits (bits 232..255). */
    am = a[8] & 0xffffff;
    a[8 + 3] += (am << 9) & 0x1fffffff;
    a[8 + 4] += am >> 20;
    a[8 + 6] += (am << 18) & 0x1fffffff;
    a[8 + 7] += (am >> 11) - ((am << 21) & 0x1fffffff);
    a[8 + 8] += -(am >> 8) + ((am << 24) & 0x1fffffff);
    a[8 + 9] += am >> 5;

    /* Shift down by 256 bits: bit 24 of digit 8 becomes bit 0 of digit 0. */
    a[0] = (a[ 8] >> 24) + ((a[ 9] << 5) & 0x1fffffff);
    a[1] = (a[ 9] >> 24) + ((a[10] << 5) & 0x1fffffff);
    a[2] = (a[10] >> 24) + ((a[11] << 5) & 0x1fffffff);
    a[3] = (a[11] >> 24) + ((a[12] << 5) & 0x1fffffff);
    a[4] = (a[12] >> 24) + ((a[13] << 5) & 0x1fffffff);
    a[5] = (a[13] >> 24) + ((a[14] << 5) & 0x1fffffff);
    a[6] = (a[14] >> 24) + ((a[15] << 5) & 0x1fffffff);
    a[7] = (a[15] >> 24) + ((a[16] << 5) & 0x1fffffff);
    a[8] = (a[16] >> 24) +  (a[17] << 5);

    /* Normalize digits 0..7 to 29 bits. */
    a[1] += a[0] >> 29; a[0] &= 0x1fffffff;
    a[2] += a[1] >> 29; a[1] &= 0x1fffffff;
    a[3] += a[2] >> 29; a[2] &= 0x1fffffff;
    a[4] += a[3] >> 29; a[3] &= 0x1fffffff;
    a[5] += a[4] >> 29; a[4] &= 0x1fffffff;
    a[6] += a[5] >> 29; a[5] &= 0x1fffffff;
    a[7] += a[6] >> 29; a[6] &= 0x1fffffff;
    a[8] += a[7] >> 29; a[7] &= 0x1fffffff;

    /* Get the bit over, if any. */
    am = a[8] >> 24;
    /* Create mask. */
    am = 0 - am;

    /* Subtract the modulus, digit by digit, when the mask is set. */
    a[0] -= 0x1fffffff & am;
    a[1] -= 0x1fffffff & am;
    a[2] -= 0x1fffffff & am;
    a[3] -= 0x000001ff & am;
    /* p256_mod[4] is zero */
    /* p256_mod[5] is zero */
    a[6] -= 0x00040000 & am;
    a[7] -= 0x1fe00000 & am;
    a[8] -= 0x00ffffff & am;

    /* Normalize again after the subtraction. */
    a[1] += a[0] >> 29; a[0] &= 0x1fffffff;
    a[2] += a[1] >> 29; a[1] &= 0x1fffffff;
    a[3] += a[2] >> 29; a[2] &= 0x1fffffff;
    a[4] += a[3] >> 29; a[3] &= 0x1fffffff;
    a[5] += a[4] >> 29; a[4] &= 0x1fffffff;
    a[6] += a[5] >> 29; a[5] &= 0x1fffffff;
    a[7] += a[6] >> 29; a[6] &= 0x1fffffff;
    a[8] += a[7] >> 29; a[7] &= 0x1fffffff;
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r by sp_256_mul_9 and then reduced in place
 * by sp_256_mont_reduce_9, so r is used as a double-width buffer. Note that
 * sp_256_mont_reduce_9 ignores m and mp and always reduces by its built-in
 * modulus.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_256_mont_mul_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_256_mul_9(r, a, b);
    sp_256_mont_reduce_9(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full 18-digit square is written to r by sp_256_sqr_9 and then reduced
 * in place by sp_256_mont_reduce_9. Note that sp_256_mont_reduce_9 ignores m
 * and mp and always reduces by its built-in modulus.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_256_mont_sqr_9(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_256_sqr_9(r, a);
    sp_256_mont_reduce_9(r, m, mp);
}

#if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY)
/* Repeatedly square a Montgomery form number. (r = a ^ (2 ^ n) mod m)
 *
 * The first squaring reads a and writes r; the remaining n - 1 squarings are
 * done on r in place. One squaring is always performed, even when n is less
 * than 1.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * n   Number of times to square.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_256_mont_sqr_n_9(sp_digit* r,
    const sp_digit* a, int n, const sp_digit* m, sp_digit mp)
{
    int done;

    sp_256_mont_sqr_9(r, a, m, mp);
    for (done = 1; done < n; done++) {
        sp_256_mont_sqr_9(r, r, m, mp);
    }
}

#endif /* !WOLFSSL_SP_SMALL || HAVE_COMP_KEY */
#ifdef WOLFSSL_SP_SMALL
/* Mod-2 for the P256 curve.
 *
 * Stored as eight 32-bit words, least significant word first.
 * sp_256_mont_inv_9 scans it bit by bit as the exponent of its
 * square-and-multiply loop.
 */
static const uint32_t p256_mod_minus_2[8] = {
    0xfffffffdU,0xffffffffU,0xffffffffU,0x00000000U,0x00000000U,0x00000000U,
    0x00000001U,0xffffffffU
};
#endif /* WOLFSSL_SP_SMALL */

/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
 * P256 curve. (r = 1 / a mod m)
 *
 * The inverse is computed by raising a to the power (modulus - 2).
 * With WOLFSSL_SP_SMALL this is a bit-by-bit square-and-multiply over
 * p256_mod_minus_2, using td as one double-width working buffer. Otherwise a
 * fixed addition chain is used with three working buffers at td, td + 2*9 and
 * td + 4*9; the comments in that branch give the exponent reached after each
 * step.
 *
 * r   Inverse result.
 * a   Number to invert.
 * td  Temporary data.
 */
static void sp_256_mont_inv_9(sp_digit* r, const sp_digit* a, sp_digit* td)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* t = td;
    int i;

    /* Bit 255 of the exponent is set: start with t = a. */
    XMEMCPY(t, a, sizeof(sp_digit) * 9);
    for (i=254; i>=0; i--) {
        sp_256_mont_sqr_9(t, t, p256_mod, p256_mp_mod);
        /* Test the bit with an unsigned 32-bit one, matching the table's
         * element type: shifting a signed 32-bit 1 left by 31 would be
         * undefined behaviour. */
        if ((p256_mod_minus_2[i / 32] & ((uint32_t)1 << (i % 32))) != 0) {
            sp_256_mont_mul_9(t, t, a, p256_mod, p256_mp_mod);
        }
    }
    XMEMCPY(r, t, sizeof(sp_digit) * 9);
#else
    sp_digit* t1 = td;
    sp_digit* t2 = td + 2 * 9;
    sp_digit* t3 = td + 4 * 9;
    /* 0x2 */
    sp_256_mont_sqr_9(t1, a, p256_mod, p256_mp_mod);
    /* 0x3 */
    sp_256_mont_mul_9(t2, t1, a, p256_mod, p256_mp_mod);
    /* 0xc */
    sp_256_mont_sqr_n_9(t1, t2, 2, p256_mod, p256_mp_mod);
    /* 0xd */
    sp_256_mont_mul_9(t3, t1, a, p256_mod, p256_mp_mod);
    /* 0xf */
    sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod);
    /* 0xf0 */
    sp_256_mont_sqr_n_9(t1, t2, 4, p256_mod, p256_mp_mod);
    /* 0xfd */
    sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod);
    /* 0xff */
    sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod);
    /* 0xff00 */
    sp_256_mont_sqr_n_9(t1, t2, 8, p256_mod, p256_mp_mod);
    /* 0xfffd */
    sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod);
    /* 0xffff */
    sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod);
    /* 0xffff0000 */
    sp_256_mont_sqr_n_9(t1, t2, 16, p256_mod, p256_mp_mod);
    /* 0xfffffffd */
    sp_256_mont_mul_9(t3, t3, t1, p256_mod, p256_mp_mod);
    /* 0xffffffff */
    sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod);
    /* 0xffffffff00000000 */
    sp_256_mont_sqr_n_9(t1, t2, 32, p256_mod, p256_mp_mod);
    /* 0xffffffffffffffff */
    sp_256_mont_mul_9(t2, t2, t1, p256_mod, p256_mp_mod);
    /* 0xffffffff00000001 */
    sp_256_mont_mul_9(r, t1, a, p256_mod, p256_mp_mod);
    /* 0xffffffff000000010000000000000000000000000000000000000000 */
    sp_256_mont_sqr_n_9(r, r, 160, p256_mod, p256_mp_mod);
    /* 0xffffffff00000001000000000000000000000000ffffffffffffffff */
    sp_256_mont_mul_9(r, r, t2, p256_mod, p256_mp_mod);
    /* 0xffffffff00000001000000000000000000000000ffffffffffffffff00000000 */
    sp_256_mont_sqr_n_9(r, r, 32, p256_mod, p256_mp_mod);
    /* 0xffffffff00000001000000000000000000000000fffffffffffffffffffffffd */
    sp_256_mont_mul_9(r, r, t3, p256_mod, p256_mp_mod);
#endif /* WOLFSSL_SP_SMALL */
}

/* Scale one ordinate by a power of 1/z and finish it for the affine result.
 *
 * The ordinate is multiplied by zi, the upper nine digits of the result
 * buffer are cleared, one more Montgomery reduction is applied and the value
 * is brought below the modulus with a constant-time conditional subtract.
 *
 * r   Result ordinate buffer (double width).
 * a   Ordinate of the projective point, in Montgomery form.
 * zi  Power of the inverse of z to multiply by, in Montgomery form.
 */
static void sp_256_map_ordinate_9(sp_digit* r, const sp_digit* a,
    const sp_digit* zi)
{
    sp_int32 n;

    sp_256_mont_mul_9(r, a, zi, p256_mod, p256_mp_mod);
    XMEMSET(r + 9, 0, sizeof(sp_digit) * 9U);
    sp_256_mont_reduce_9(r, p256_mod, p256_mp_mod);
    /* Reduce to less than modulus */
    n = sp_256_cmp_9(r, p256_mod);
    sp_256_cond_sub_9(r, r, p256_mod, ~(n >> 28));
    sp_256_norm_9(r);
}

/* Convert a projective point in Montgomery form into an affine point.
 *
 * Computes x / z^2 and y / z^3 and sets z to 1. Only the first half of the
 * z buffer of r is cleared before its lowest digit is set.
 *
 * r  Resulting affine coordinate point.
 * p  Montgomery form projective coordinate point.
 * t  Temporary ordinate data.
 */
static void sp_256_map_9(sp_point_256* r, const sp_point_256* p,
    sp_digit* t)
{
    sp_digit* zinv3 = t;
    sp_digit* zinv2 = t + 2*9;

    /* 1 / z goes to the start of t; the area from t + 2*9 is scratch. */
    sp_256_mont_inv_9(zinv3, p->z, zinv2);
    /* 1 / z^2 */
    sp_256_mont_sqr_9(zinv2, zinv3, p256_mod, p256_mp_mod);
    /* 1 / z^3 */
    sp_256_mont_mul_9(zinv3, zinv2, zinv3, p256_mod, p256_mp_mod);

    /* x /= z^2 */
    sp_256_map_ordinate_9(r->x, p->x, zinv2);
    /* y /= z^3 */
    sp_256_map_ordinate_9(r->y, p->y, zinv3);

    XMEMSET(r->z, 0, sizeof(r->z) / 2);
    r->z[0] = 1;
}

/* Modular addition of two Montgomery form numbers. (r = a + b % m)
 *
 * The digit-wise sum is normalized; m is then subtracted, without branching,
 * when the sum has anything at or above bit 256 (bit 24 of digit 8).
 *
 * r   Result of addition.
 * a   First number to add in Montgomery form.
 * b   Second number to add in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_256_mont_add_9(sp_digit* r, const sp_digit* a, const sp_digit* b,
        const sp_digit* m)
{
    sp_digit sub_mask;

    (void)sp_256_add_9(r, a, b);
    sp_256_norm_9(r);
    /* All ones when r[8] >> 24 is positive, zero when it is zero. */
    sub_mask = ~(((r[8] >> 24) - 1) >> 31);
    sp_256_cond_sub_9(r, r, m, sub_mask);
    sp_256_norm_9(r);
}

/* Modular doubling of a Montgomery form number. (r = a + a % m)
 *
 * Doubling is the modular addition of a to itself, so the work is handed to
 * sp_256_mont_add_9, which performs the same add, normalize, conditional
 * subtract and normalize sequence.
 *
 * r   Result of doubling.
 * a   Number to double in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_256_mont_dbl_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    sp_256_mont_add_9(r, a, a, m);
}

/* Modular tripling of a Montgomery form number. (r = a + a + a % m)
 *
 * Performed as two modular additions: first r = a + a, then r = r + a. Each
 * step is the add, normalize, conditional subtract and normalize sequence of
 * sp_256_mont_add_9.
 *
 * r   Result of Tripling.
 * a   Number to triple in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_256_mont_tpl_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    /* r = 2a */
    sp_256_mont_add_9(r, a, a, m);
    /* r = 2a + a */
    sp_256_mont_add_9(r, r, a, m);
}

#ifdef WOLFSSL_SP_SMALL
/* Add b to a when the mask says so, without branching on the mask.
 * m is -1 to add and 0 when not.
 *
 * Each digit of b is AND-ed with m before being added. No carry is moved
 * between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int limb = 0;

    while (limb < 9) {
        r[limb] = a[limb] + (b[limb] & m);
        limb++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

#ifndef WOLFSSL_SP_SMALL
/* Conditionally add a and b using the mask m.
 * m is -1 to add and 0 when not.
 *
 * Unrolled form. Each digit of b is AND-ed with m before it is added, so the
 * same instructions run whether or not the addition takes effect. No carry is
 * propagated between digits.
 *
 * r  A single precision number representing conditional add result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply.
 */
static void sp_256_cond_add_9(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    r[ 0] = a[ 0] + (b[ 0] & m);
    r[ 1] = a[ 1] + (b[ 1] & m);
    r[ 2] = a[ 2] + (b[ 2] & m);
    r[ 3] = a[ 3] + (b[ 3] & m);
    r[ 4] = a[ 4] + (b[ 4] & m);
    r[ 5] = a[ 5] + (b[ 5] & m);
    r[ 6] = a[ 6] + (b[ 6] & m);
    r[ 7] = a[ 7] + (b[ 7] & m);
    r[ 8] = a[ 8] + (b[ 8] & m);
}
#endif /* !WOLFSSL_SP_SMALL */

/* Modular subtraction of two Montgomery form numbers. (r = a - b % m)
 *
 * The digit-wise difference is normalized; r[8] >> 24 is then used as the
 * mask for a conditional add of m, followed by a second normalization.
 *
 * r   Result of subtraction.
 * a   Number to subtract from in Montgomery form.
 * b   Number to subtract with in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_256_mont_sub_9(sp_digit* r, const sp_digit* a, const sp_digit* b,
        const sp_digit* m)
{
    sp_digit add_mask;

    (void)sp_256_sub_9(r, a, b);
    sp_256_norm_9(r);
    add_mask = r[8] >> 24;
    sp_256_cond_add_9(r, r, m, add_mask);
    sp_256_norm_9(r);
}

/* Shift number right one bit.
 * Bottom bit is lost.
 *
 * Each of the low eight result digits is the source digit shifted right by
 * one with bit 0 of the next higher source digit placed at bit 28. Digits are
 * produced in ascending order, so r may be the same buffer as a.
 *
 * r  Result of shift.
 * a  Number to shift.
 */
SP_NOINLINE static void sp_256_rshift1_9(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int limb;

    for (limb = 0; limb < 8; limb++) {
        /* Bit 0 of the digit above, moved to bit 28. */
        sp_digit from_above = (a[limb + 1] << 28) & 0x1fffffff;

        r[limb] = (a[limb] >> 1) + from_above;
    }
#else
    r[0] = (a[0] >> 1) + ((a[1] << 28) & 0x1fffffff);
    r[1] = (a[1] >> 1) + ((a[2] << 28) & 0x1fffffff);
    r[2] = (a[2] >> 1) + ((a[3] << 28) & 0x1fffffff);
    r[3] = (a[3] >> 1) + ((a[4] << 28) & 0x1fffffff);
    r[4] = (a[4] >> 1) + ((a[5] << 28) & 0x1fffffff);
    r[5] = (a[5] >> 1) + ((a[6] << 28) & 0x1fffffff);
    r[6] = (a[6] >> 1) + ((a[7] << 28) & 0x1fffffff);
    r[7] = (a[7] >> 1) + ((a[8] << 28) & 0x1fffffff);
#endif
    /* Nothing above the top digit to bring in. */
    r[8] = a[8] >> 1;
}

/* Halve a number modulo the modulus (prime). (r = a / 2 % m)
 *
 * When a is odd the modulus is added first, using a mask built from bit 0 of
 * a so that no branch is taken. The normalized sum is then shifted right by
 * one bit.
 *
 * r  Result of division by 2.
 * a  Number to divide.
 * m  Modulus (prime).
 */
static void sp_256_mont_div2_9(sp_digit* r, const sp_digit* a,
        const sp_digit* m)
{
    /* All ones when bit 0 of a is set, zero otherwise. */
    sp_digit odd_mask = 0 - (a[0] & 1);

    sp_256_cond_add_9(r, a, m, odd_mask);
    sp_256_norm_9(r);
    sp_256_rshift1_9(r, r);
}

/* Double the Montgomery form projective point p.
 *
 * r and p may be the same point: the infinity flag is only copied when they
 * differ, and every ordinate of p is read before the matching ordinate of r
 * is overwritten. Two working values are kept in t, at t and at t + 2*9.
 *
 * r  Result of doubling point.
 * p  Point to double.
 * t  Temporary ordinate data.
 */
static void sp_256_proj_point_dbl_9(sp_point_256* r, const sp_point_256* p,
    sp_digit* t)
{
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*9;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;

    /* x, y and z refer to the ordinates of the result. */
    x = r->x;
    y = r->y;
    z = r->z;
    /* Put infinity into result. */
    if (r != p) {
        r->infinity = p->infinity;
    }

    /* T1 = Z * Z */
    sp_256_mont_sqr_9(t1, p->z, p256_mod, p256_mp_mod);
    /* Z = Y * Z */
    sp_256_mont_mul_9(z, p->y, p->z, p256_mod, p256_mp_mod);
    /* Z = 2Z */
    sp_256_mont_dbl_9(z, z, p256_mod);
    /* T2 = X - T1 */
    sp_256_mont_sub_9(t2, p->x, t1, p256_mod);
    /* T1 = X + T1 */
    sp_256_mont_add_9(t1, p->x, t1, p256_mod);
    /* T2 = T1 * T2 */
    sp_256_mont_mul_9(t2, t1, t2, p256_mod, p256_mp_mod);
    /* T1 = 3T2 */
    sp_256_mont_tpl_9(t1, t2, p256_mod);
    /* Y = 2Y */
    sp_256_mont_dbl_9(y, p->y, p256_mod);
    /* Y = Y * Y */
    sp_256_mont_sqr_9(y, y, p256_mod, p256_mp_mod);
    /* T2 = Y * Y */
    sp_256_mont_sqr_9(t2, y, p256_mod, p256_mp_mod);
    /* T2 = T2/2 */
    sp_256_mont_div2_9(t2, t2, p256_mod);
    /* Y = Y * X */
    sp_256_mont_mul_9(y, y, p->x, p256_mod, p256_mp_mod);
    /* X = T1 * T1 */
    sp_256_mont_sqr_9(x, t1, p256_mod, p256_mp_mod);
    /* X = X - Y */
    sp_256_mont_sub_9(x, x, y, p256_mod);
    /* X = X - Y (second time: X = T1 * T1 - 2Y in total) */
    sp_256_mont_sub_9(x, x, y, p256_mod);
    /* Y = Y - X */
    sp_256_mont_sub_9(y, y, x, p256_mod);
    /* Y = Y * T1 */
    sp_256_mont_mul_9(y, y, t1, p256_mod, p256_mp_mod);
    /* Y = Y - T2 */
    sp_256_mont_sub_9(y, y, t2, p256_mod);
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of the non-blocking point double
 * (sp_256_proj_point_dbl_9_nb), which overlays it on the data area of the
 * caller's sp_ecc_ctx_t.
 */
typedef struct sp_256_proj_point_dbl_9_ctx {
    /* Step of the doubling sequence to run on the next call. */
    int state;
    /* First temporary: start of the caller's temporary data. */
    sp_digit* t1;
    /* Second temporary: 2*9 digits into the caller's temporary data. */
    sp_digit* t2;
    /* X ordinate of the result point. */
    sp_digit* x;
    /* Y ordinate of the result point. */
    sp_digit* y;
    /* Z ordinate of the result point. */
    sp_digit* z;
} sp_256_proj_point_dbl_9_ctx;

/* Double the Montgomery form projective point p - non-blocking variant.
 *
 * Performs the same sequence of field operations as the blocking doubling but
 * at most one per call. Progress is kept in the state stored in sp_ctx->data,
 * so the function must be called repeatedly with the same arguments until it
 * stops returning FP_WOULDBLOCK. State 0 performs the setup; the state is
 * therefore expected to be zero on the first call.
 *
 * sp_ctx  Context whose data area is used as a sp_256_proj_point_dbl_9_ctx.
 * r       Result of doubling point.
 * p       Point to double.
 * t       Temporary ordinate data.
 * returns FP_WOULDBLOCK while more calls are required and MP_OKAY once the
 *         final state (19) has been reached.
 */
static int sp_256_proj_point_dbl_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
        const sp_point_256* p, sp_digit* t)
{
    int err = FP_WOULDBLOCK;
    sp_256_proj_point_dbl_9_ctx* ctx = (sp_256_proj_point_dbl_9_ctx*)sp_ctx->data;

    /* Compile time check: the array size is negative, and compilation fails,
     * when the doubling context is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_256_proj_point_dbl_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    /* A state outside 0..19 matches no case and leaves err as FP_WOULDBLOCK. */
    switch (ctx->state) {
    case 0:
        /* Setup: record scratch and result pointers for the later states. */
        ctx->t1 = t;
        ctx->t2 = t + 2*9;
        ctx->x = r->x;
        ctx->y = r->y;
        ctx->z = r->z;

        /* Put infinity into result. */
        if (r != p) {
            r->infinity = p->infinity;
        }
        ctx->state = 1;
        break;
    case 1:
        /* T1 = Z * Z */
        sp_256_mont_sqr_9(ctx->t1, p->z, p256_mod, p256_mp_mod);
        ctx->state = 2;
        break;
    case 2:
        /* Z = Y * Z */
        sp_256_mont_mul_9(ctx->z, p->y, p->z, p256_mod, p256_mp_mod);
        ctx->state = 3;
        break;
    case 3:
        /* Z = 2Z */
        sp_256_mont_dbl_9(ctx->z, ctx->z, p256_mod);
        ctx->state = 4;
        break;
    case 4:
        /* T2 = X - T1 */
        sp_256_mont_sub_9(ctx->t2, p->x, ctx->t1, p256_mod);
        ctx->state = 5;
        break;
    case 5:
        /* T1 = X + T1 */
        sp_256_mont_add_9(ctx->t1, p->x, ctx->t1, p256_mod);
        ctx->state = 6;
        break;
    case 6:
        /* T2 = T1 * T2 */
        sp_256_mont_mul_9(ctx->t2, ctx->t1, ctx->t2, p256_mod, p256_mp_mod);
        ctx->state = 7;
        break;
    case 7:
        /* T1 = 3T2 */
        sp_256_mont_tpl_9(ctx->t1, ctx->t2, p256_mod);
        ctx->state = 8;
        break;
    case 8:
        /* Y = 2Y */
        sp_256_mont_dbl_9(ctx->y, p->y, p256_mod);
        ctx->state = 9;
        break;
    case 9:
        /* Y = Y * Y */
        sp_256_mont_sqr_9(ctx->y, ctx->y, p256_mod, p256_mp_mod);
        ctx->state = 10;
        break;
    case 10:
        /* T2 = Y * Y */
        sp_256_mont_sqr_9(ctx->t2, ctx->y, p256_mod, p256_mp_mod);
        ctx->state = 11;
        break;
    case 11:
        /* T2 = T2/2 */
        sp_256_mont_div2_9(ctx->t2, ctx->t2, p256_mod);
        ctx->state = 12;
        break;
    case 12:
        /* Y = Y * X */
        sp_256_mont_mul_9(ctx->y, ctx->y, p->x, p256_mod, p256_mp_mod);
        ctx->state = 13;
        break;
    case 13:
        /* X = T1 * T1 */
        sp_256_mont_sqr_9(ctx->x, ctx->t1, p256_mod, p256_mp_mod);
        ctx->state = 14;
        break;
    case 14:
        /* X = X - Y */
        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->y, p256_mod);
        ctx->state = 15;
        break;
    case 15:
        /* X = X - Y */
        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->y, p256_mod);
        ctx->state = 16;
        break;
    case 16:
        /* Y = Y - X */
        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod);
        ctx->state = 17;
        break;
    case 17:
        /* Y = Y * T1 */
        sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t1, p256_mod, p256_mp_mod);
        ctx->state = 18;
        break;
    case 18:
        /* Y = Y - T2 */
        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t2, p256_mod);
        ctx->state = 19;
        /* The last operation reports completion in the same call. */
        /* fall-through */
    case 19:
        /* Finished - calling again keeps returning MP_OKAY. */
        err = MP_OKAY;
        break;
    }

    /* Only the final state may report success. */
    if (err == MP_OKAY && ctx->state != 19) {
        err = FP_WOULDBLOCK;
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
/* Check whether two 9 digit numbers are identical.
 * No branch or early exit depends on the data: every digit pair is always
 * combined before the single final test.
 *
 * a  First number to compare.
 * b  Second number to compare.
 * returns 1 when equal and 0 otherwise.
 */
static int sp_256_cmp_equal_9(const sp_digit* a, const sp_digit* b)
{
    /* Collects every bit in which the two numbers differ. */
    sp_digit diff;

    diff  = a[0] ^ b[0];
    diff |= a[1] ^ b[1];
    diff |= a[2] ^ b[2];
    diff |= a[3] ^ b[3];
    diff |= a[4] ^ b[4];
    diff |= a[5] ^ b[5];
    diff |= a[6] ^ b[6];
    diff |= a[7] ^ b[7];
    diff |= a[8] ^ b[8];

    return diff == 0;
}

/* Check whether a 9 digit number is zero.
 * No branch or early exit depends on the data: all digits are always
 * combined before the single final test.
 *
 * a  Number to check.
 * returns 1 if the number is zero and 0 otherwise.
 */
static int sp_256_iszero_9(const sp_digit* a)
{
    /* Collects every bit set in any digit. */
    sp_digit bits;

    bits  = a[0];
    bits |= a[1];
    bits |= a[2];
    bits |= a[3];
    bits |= a[4];
    bits |= a[5];
    bits |= a[6];
    bits |= a[7];
    bits |= a[8];

    return bits == 0;
}


/* Add two Montgomery form projective points.
 *
 * The sum is first computed into the temporary data and then merged with the
 * inputs through masks derived from the infinity flags: when exactly one
 * input is at infinity the other input is copied to the result, and when both
 * are the result is flagged as infinity. When both inputs are finite and
 * describe the same point the doubling routine is used instead.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data.
 */
static void sp_256_proj_point_add_9(sp_point_256* r,
        const sp_point_256* p, const sp_point_256* q, sp_digit* t)
{
    /* Six scratch slots, named by their position in the temporary data. */
    sp_digit* const w0 = t;
    sp_digit* const w1 = t + 2*9;
    sp_digit* const w2 = t + 4*9;
    sp_digit* const w3 = t + 6*9;
    sp_digit* const w4 = t + 8*9;
    sp_digit* const w5 = t + 10*9;
    /* The computed sum ends up in the first three slots. */
    sp_digit* const x3 = w0;
    sp_digit* const y3 = w1;
    sp_digit* const z3 = w2;
    sp_digit maskp;
    sp_digit maskq;
    sp_digit maskt;
    sp_digit inf;
    int j;

    /* w1 = U1 = X1*Z2^2, w3 = Z2^3 */
    sp_256_mont_sqr_9(w1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(w3, w1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(w1, w1, p->x, p256_mod, p256_mp_mod);
    /* w2 = U2 = X2*Z1^2, w4 = Z1^3 */
    sp_256_mont_sqr_9(w2, p->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(w4, w2, p->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(w2, w2, q->x, p256_mod, p256_mp_mod);
    /* w3 = S1 = Y1*Z2^3 */
    sp_256_mont_mul_9(w3, w3, p->y, p256_mod, p256_mp_mod);
    /* w4 = S2 = Y2*Z1^3 */
    sp_256_mont_mul_9(w4, w4, q->y, p256_mod, p256_mp_mod);

    /* Same finite point on both sides (U1 == U2 and S1 == S2): double. */
    if ((~p->infinity) & (~q->infinity) &
            sp_256_cmp_equal_9(w2, w1) &
            sp_256_cmp_equal_9(w4, w3)) {
        sp_256_proj_point_dbl_9(r, p, t);
        return;
    }

    /* w2 = H = U2 - U1 */
    sp_256_mont_sub_9(w2, w2, w1, p256_mod);
    /* w4 = R = S2 - S1 */
    sp_256_mont_sub_9(w4, w4, w3, p256_mod);
    /* w5 = H^2, y3 = U1*H^2, w5 = H^3 */
    sp_256_mont_sqr_9(w5, w2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(y3, w1, w5, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(w5, w5, w2, p256_mod, p256_mp_mod);
    /* Z3 = H*Z1*Z2 */
    sp_256_mont_mul_9(z3, p->z, w2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(z3, z3, q->z, p256_mod, p256_mp_mod);
    /* X3 = R^2 - H^3 - 2*U1*H^2 (w5 becomes S1*H^3 along the way) */
    sp_256_mont_sqr_9(x3, w4, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(x3, x3, w5, p256_mod);
    sp_256_mont_mul_9(w5, w5, w3, p256_mod, p256_mp_mod);
    sp_256_mont_dbl_9(w3, y3, p256_mod);
    sp_256_mont_sub_9(x3, x3, w3, p256_mod);
    /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
    sp_256_mont_sub_9(y3, y3, x3, p256_mod);
    sp_256_mont_mul_9(y3, y3, w4, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(y3, y3, w5, p256_mod);

    /* Select the output without branching on the infinity flags:
     *   maskp - all ones when only q is at infinity (result is p)
     *   maskq - all ones when only p is at infinity (result is q)
     *   maskt - all ones otherwise (result is the computed sum)
     */
    maskp = 0 - (q->infinity & (!p->infinity));
    maskq = 0 - (p->infinity & (!q->infinity));
    maskt = ~(maskp | maskq);
    inf = (sp_digit)(p->infinity & q->infinity);

    for (j = 0; j < 9; j++) {
        r->x[j] = (p->x[j] & maskp) | (q->x[j] & maskq) |
                  (x3[j] & maskt);
    }
    for (j = 0; j < 9; j++) {
        r->y[j] = (p->y[j] & maskp) | (q->y[j] & maskq) |
                  (y3[j] & maskt);
    }
    for (j = 0; j < 9; j++) {
        r->z[j] = (p->z[j] & maskp) | (q->z[j] & maskq) |
                  (z3[j] & maskt);
    }
    /* Both inputs at infinity: set the low bit of z and flag the result. */
    r->z[0] |= inf;
    r->infinity = (word32)inf;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Progress record for the non-blocking point addition.
 * Overlaid on the data area of a sp_ecc_ctx_t by the non-blocking routine.
 */
typedef struct sp_256_proj_point_add_9_ctx {
    int state;      /* Next step to run: 0 = setup, 25 = finished. */
    /* Context for a nested doubling. The non-blocking addition zeroes it when
     * it detects the doubling case. */
    sp_256_proj_point_dbl_9_ctx dbl_ctx;
    /* NOTE(review): ap and rp are not referenced by the non-blocking addition
     * in this part of the file - confirm whether they are still needed. */
    const sp_point_256* ap[2];
    sp_point_256* rp[2];
    sp_digit* t1;   /* Scratch at temporary data + 2*9. */
    sp_digit* t2;   /* Scratch at temporary data + 4*9. */
    sp_digit* t3;   /* Scratch at temporary data + 6*9. */
    sp_digit* t4;   /* Scratch at temporary data + 8*9. */
    sp_digit* t5;   /* Scratch at temporary data + 10*9. */
    sp_digit* t6;   /* Scratch at the start of the temporary data. */
    sp_digit* x;    /* Computed x-ordinate; aliases t6. */
    sp_digit* y;    /* Computed y-ordinate; aliases t1. */
    sp_digit* z;    /* Computed z-ordinate; aliases t2. */
} sp_256_proj_point_add_9_ctx;

/* Add two Montgomery form projective points - non-blocking variant.
 *
 * Performs the same sequence of field operations as the blocking addition but
 * only a small step per call. Progress is kept in the state stored in
 * sp_ctx->data, so the function must be called repeatedly with the same
 * arguments until it stops returning FP_WOULDBLOCK. State 0 performs the
 * setup; the state is therefore expected to be zero on the first call.
 *
 * sp_ctx  Context whose data area is used as a sp_256_proj_point_add_9_ctx.
 * r       Result of addition.
 * p       First point to add.
 * q       Second point to add.
 * t       Temporary ordinate data.
 * returns FP_WOULDBLOCK while more calls are required and MP_OKAY once the
 *         final state (25) has been reached.
 */
static int sp_256_proj_point_add_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
{
    int err = FP_WOULDBLOCK;
    sp_256_proj_point_add_9_ctx* ctx = (sp_256_proj_point_add_9_ctx*)sp_ctx->data;

    /* Ensure only the first point is the same as the result. */
    /* The swap is repeated on every call and gives the same outcome each
     * time as long as the caller passes the same arguments. */
    if (q == r) {
        const sp_point_256* a = p;
        p = q;
        q = a;
    }

    /* Compile time check: the array size is negative, and compilation fails,
     * when the addition context is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_256_proj_point_add_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    /* A state outside 0..25 matches no case and leaves err as FP_WOULDBLOCK. */
    switch (ctx->state) {
    case 0: /* INIT */
        /* Record the scratch slots; x, y and z alias t6, t1 and t2. */
        ctx->t6 = t;
        ctx->t1 = t + 2*9;
        ctx->t2 = t + 4*9;
        ctx->t3 = t + 6*9;
        ctx->t4 = t + 8*9;
        ctx->t5 = t + 10*9;
        ctx->x = ctx->t6;
        ctx->y = ctx->t1;
        ctx->z = ctx->t2;

        ctx->state = 1;
        break;
    case 1:
        /* U1 = X1*Z2^2 */
        sp_256_mont_sqr_9(ctx->t1, q->z, p256_mod, p256_mp_mod);
        ctx->state = 2;
        break;
    case 2:
        /* T3 = Z2^3 */
        sp_256_mont_mul_9(ctx->t3, ctx->t1, q->z, p256_mod, p256_mp_mod);
        ctx->state = 3;
        break;
    case 3:
        sp_256_mont_mul_9(ctx->t1, ctx->t1, p->x, p256_mod, p256_mp_mod);
        ctx->state = 4;
        break;
    case 4:
        /* U2 = X2*Z1^2 */
        sp_256_mont_sqr_9(ctx->t2, p->z, p256_mod, p256_mp_mod);
        ctx->state = 5;
        break;
    case 5:
        /* T4 = Z1^3 */
        sp_256_mont_mul_9(ctx->t4, ctx->t2, p->z, p256_mod, p256_mp_mod);
        ctx->state = 6;
        break;
    case 6:
        sp_256_mont_mul_9(ctx->t2, ctx->t2, q->x, p256_mod, p256_mp_mod);
        ctx->state = 7;
        break;
    case 7:
        /* S1 = Y1*Z2^3 */
        sp_256_mont_mul_9(ctx->t3, ctx->t3, p->y, p256_mod, p256_mp_mod);
        ctx->state = 8;
        break;
    case 8:
        /* S2 = Y2*Z1^3 */
        sp_256_mont_mul_9(ctx->t4, ctx->t4, q->y, p256_mod, p256_mp_mod);
        ctx->state = 9;
        break;
    case 9:
        /* Check double */
        if ((~p->infinity) & (~q->infinity) &
                sp_256_cmp_equal_9(ctx->t2, ctx->t1) &
                sp_256_cmp_equal_9(ctx->t4, ctx->t3)) {
            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
            /* The doubling is done in one go with the blocking routine and
             * the state machine jumps straight to the final state. */
            sp_256_proj_point_dbl_9(r, p, t);
            ctx->state = 25;
        }
        else {
            ctx->state = 10;
        }
        break;
    case 10:
        /* H = U2 - U1 */
        sp_256_mont_sub_9(ctx->t2, ctx->t2, ctx->t1, p256_mod);
        ctx->state = 11;
        break;
    case 11:
        /* R = S2 - S1 */
        sp_256_mont_sub_9(ctx->t4, ctx->t4, ctx->t3, p256_mod);
        ctx->state = 12;
        break;
    case 12:
        /* X3 = R^2 - H^3 - 2*U1*H^2 */
        /* T5 = H^2 */
        sp_256_mont_sqr_9(ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
        ctx->state = 13;
        break;
    case 13:
        /* Y = U1*H^2 */
        sp_256_mont_mul_9(ctx->y, ctx->t1, ctx->t5, p256_mod, p256_mp_mod);
        ctx->state = 14;
        break;
    case 14:
        /* T5 = H^3 */
        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t2, p256_mod, p256_mp_mod);
        ctx->state = 15;
        break;
    case 15:
        /* Z3 = H*Z1*Z2 */
        sp_256_mont_mul_9(ctx->z, p->z, ctx->t2, p256_mod, p256_mp_mod);
        ctx->state = 16;
        break;
    case 16:
        sp_256_mont_mul_9(ctx->z, ctx->z, q->z, p256_mod, p256_mp_mod);
        ctx->state = 17;
        break;
    case 17:
        /* X = R^2 */
        sp_256_mont_sqr_9(ctx->x, ctx->t4, p256_mod, p256_mp_mod);
        ctx->state = 18;
        break;
    case 18:
        /* X = R^2 - H^3 */
        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t5, p256_mod);
        ctx->state = 19;
        break;
    case 19:
        /* T5 = S1*H^3 */
        sp_256_mont_mul_9(ctx->t5, ctx->t5, ctx->t3, p256_mod, p256_mp_mod);
        ctx->state = 20;
        break;
    case 20:
        /* T3 = 2*U1*H^2, X = X - T3. Two operations share this state. */
        sp_256_mont_dbl_9(ctx->t3, ctx->y, p256_mod);
        sp_256_mont_sub_9(ctx->x, ctx->x, ctx->t3, p256_mod);
        ctx->state = 21;
        break;
    case 21:
        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->x, p256_mod);
        ctx->state = 22;
        break;
    case 22:
        sp_256_mont_mul_9(ctx->y, ctx->y, ctx->t4, p256_mod, p256_mp_mod);
        ctx->state = 23;
        break;
    case 23:
        sp_256_mont_sub_9(ctx->y, ctx->y, ctx->t5, p256_mod);
        ctx->state = 24;
        break;
    case 24:
    {
        /* Select the output without branching on the infinity flags:
         * p when only q is at infinity, q when only p is at infinity and the
         * computed sum otherwise. */
        {
            int i;
            sp_digit maskp = 0 - (q->infinity & (!p->infinity));
            sp_digit maskq = 0 - (p->infinity & (!q->infinity));
            sp_digit maskt = ~(maskp | maskq);
            sp_digit inf = (sp_digit)(p->infinity & q->infinity);

            for (i = 0; i < 9; i++) {
                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
                          (ctx->x[i] & maskt);
            }
            for (i = 0; i < 9; i++) {
                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
                          (ctx->y[i] & maskt);
            }
            for (i = 0; i < 9; i++) {
                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
                          (ctx->z[i] & maskt);
            }
            /* Both inputs at infinity: set low bit of z and flag result. */
            r->z[0] |= inf;
            r->infinity = (word32)inf;
        }
        ctx->state = 25;
        break;
    }
    case 25:
        /* Finished - calling again keeps returning MP_OKAY. */
        err = MP_OKAY;
        break;
    }

    /* Only the final state may report success. */
    if (err == MP_OKAY && ctx->state != 25) {
        err = FP_WOULDBLOCK;
    }
    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */

/* Multiply a number by Montgomery normalizer mod modulus (prime).
 *
 * The 9 digit (29 bits each) input is repacked into eight 32-bit words held in
 * 64-bit signed integers, combined with fixed small coefficients, carried and
 * folded, and then repacked into 29-bit digits.
 *
 * r  The resulting Montgomery form number.
 * a  The number to convert.
 * m  The modulus (prime). Not used - the coefficients are fixed.
 * returns MEMORY_E when memory allocation fails and MP_OKAY otherwise.
 */
static int sp_256_mod_mul_norm_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    int64_t* t = NULL;
#else
    int64_t t[2 * 8];
#endif
    int64_t* a32 = NULL;
    int64_t o;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 8, NULL, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
#endif

    if (err == MP_OKAY) {
        /* First half of the buffer holds the sums, second half the input. */
        a32 = t + 8;

        /* Repack 29-bit digits into 32-bit words.
         * The left shifts are done on uint32_t: shifting sp_digit directly
         * would overflow (undefined behaviour) if it is a signed 32-bit type,
         * as assumed here - TODO confirm against sp.h. Only the low 32 bits
         * are kept by the mask that follows, so the result is unchanged. */
        a32[0] = a[0];
        a32[0] |= (int64_t)((uint32_t)a[1] << 29U);
        a32[0] &= 0xffffffffL;
        a32[1] = (a[1] >> 3);
        a32[1] |= (int64_t)((uint32_t)a[2] << 26U);
        a32[1] &= 0xffffffffL;
        a32[2] = (a[2] >> 6);
        a32[2] |= (int64_t)((uint32_t)a[3] << 23U);
        a32[2] &= 0xffffffffL;
        a32[3] = (a[3] >> 9);
        a32[3] |= (int64_t)((uint32_t)a[4] << 20U);
        a32[3] &= 0xffffffffL;
        a32[4] = (a[4] >> 12);
        a32[4] |= (int64_t)((uint32_t)a[5] << 17U);
        a32[4] &= 0xffffffffL;
        a32[5] = (a[5] >> 15);
        a32[5] |= (int64_t)((uint32_t)a[6] << 14U);
        a32[5] &= 0xffffffffL;
        a32[6] = (a[6] >> 18);
        a32[6] |= (int64_t)((uint32_t)a[7] << 11U);
        a32[6] &= 0xffffffffL;
        a32[7] = (a[7] >> 21);
        a32[7] |= (int64_t)((uint32_t)a[8] << 8U);
        a32[7] &= 0xffffffffL;

        /* Each comment lists the coefficients applied to a32[0..7]. */
        /*  1  1  0 -1 -1 -1 -1  0 */
        t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6];
        /*  0  1  1  0 -1 -1 -1 -1 */
        t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7];
        /*  0  0  1  1  0 -1 -1 -1 */
        t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7];
        /* -1 -1  0  2  2  1  0 -1 */
        t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7];
        /*  0 -1 -1  0  2  2  1  0 */
        t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6];
        /*  0  0 -1 -1  0  2  2  1 */
        t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7];
        /* -1 -1  0  0  0  1  3  2 */
        t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7];
        /*  1  0 -1 -1 -1 -1  0  3 */
        t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7];

        /* Propagate the (possibly negative) carries upwards. */
        t[1] += t[0] >> 32U; t[0] &= 0xffffffffL;
        t[2] += t[1] >> 32U; t[1] &= 0xffffffffL;
        t[3] += t[2] >> 32U; t[2] &= 0xffffffffL;
        t[4] += t[3] >> 32U; t[3] &= 0xffffffffL;
        t[5] += t[4] >> 32U; t[4] &= 0xffffffffL;
        t[6] += t[5] >> 32U; t[5] &= 0xffffffffL;
        t[7] += t[6] >> 32U; t[6] &= 0xffffffffL;
        /* Fold the overflow of the top word back into words 0, 3, 6 and 7. */
        o     = t[7] >> 32U; t[7] &= 0xffffffffL;
        t[0] += o;
        t[3] -= o;
        t[6] -= o;
        t[7] += o;
        /* Second carry pass after the fold. */
        t[1] += t[0] >> 32U; t[0] &= 0xffffffffL;
        t[2] += t[1] >> 32U; t[1] &= 0xffffffffL;
        t[3] += t[2] >> 32U; t[2] &= 0xffffffffL;
        t[4] += t[3] >> 32U; t[3] &= 0xffffffffL;
        t[5] += t[4] >> 32U; t[4] &= 0xffffffffL;
        t[6] += t[5] >> 32U; t[5] &= 0xffffffffL;
        t[7] += t[6] >> 32U; t[6] &= 0xffffffffL;

        /* Repack the 32-bit words into 29-bit digits. */
        r[0] = (sp_digit)(t[0]) & 0x1fffffffL;
        r[1] = (sp_digit)(t[0] >> 29U);
        r[1] |= (sp_digit)(t[1] << 3U);
        r[1] &= 0x1fffffffL;
        r[2] = (sp_digit)(t[1] >> 26U);
        r[2] |= (sp_digit)(t[2] << 6U);
        r[2] &= 0x1fffffffL;
        r[3] = (sp_digit)(t[2] >> 23U);
        r[3] |= (sp_digit)(t[3] << 9U);
        r[3] &= 0x1fffffffL;
        r[4] = (sp_digit)(t[3] >> 20U);
        r[4] |= (sp_digit)(t[4] << 12U);
        r[4] &= 0x1fffffffL;
        r[5] = (sp_digit)(t[4] >> 17U);
        r[5] |= (sp_digit)(t[5] << 15U);
        r[5] &= 0x1fffffffL;
        r[6] = (sp_digit)(t[5] >> 14U);
        r[6] |= (sp_digit)(t[6] << 18U);
        r[6] &= 0x1fffffffL;
        r[7] = (sp_digit)(t[6] >> 11U);
        r[7] |= (sp_digit)(t[7] << 21U);
        r[7] &= 0x1fffffffL;
        r[8] = (sp_digit)(t[7] >> 8U);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
        XFREE(t, NULL, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef WOLFSSL_SP_SMALL
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Small implementation using add and double that is cache attack resistant but
 * allocates memory rather than use large stacks.
 * 256 adds and doubles.
 *
 * The scalar is consumed one bit at a time from the most significant end.
 * For every bit an addition and a doubling are performed; the bit only
 * selects, through address masks, which of the two working points is read
 * and written.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required. Not used - always constant time.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g,
        const sp_digit* k, int map, int ct, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* t = NULL;
    sp_digit* tmp = NULL;
#else
    sp_point_256 t[3];
    sp_digit tmp[2 * 9 * 6];
#endif
    sp_digit n;
    int i;
    int c;
    int y;
    int err = MP_OKAY;

    /* Implementation is constant time. */
    (void)ct;
    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap,
                                     DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap,
                                 DYNAMIC_TYPE_ECC);
        if (tmp == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        XMEMSET(t, 0, sizeof(sp_point_256) * 3);

        /* t[0] starts as the point at infinity: zeroed with the flag set. */
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_256_mod_mul_norm_9(t[1].x, g->x, p256_mod);
    }
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_9(t[1].y, g->y, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_9(t[1].z, g->z, p256_mod);

    if (err == MP_OKAY) {
        /* Top digit holds 24 of the 256 bits; align them to bit 28. */
        i = 8;
        c = 24;
        n = k[i--] << (29 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Digit used up - load the next one or finish. */
                if (i == -1)
                    break;

                n = k[i--];
                c = 29;
            }

            /* Next scalar bit is always taken from bit 28. */
            y = (n >> 28) & 1;
            /* Keep n within 29 bits. Bits shifted above bit 28 are never read
             * and, if sp_digit is a signed 32-bit type (assumed - TODO
             * confirm), would otherwise eventually overflow it. */
            n = (n << 1) & 0x1fffffff;

            /* t[y^1] = t[0] + t[1] */
            sp_256_proj_point_add_9(&t[y^1], &t[0], &t[1], tmp);

            /* t[y] = 2 * t[y], reached through an address computed with
             * masks rather than by indexing with the scalar bit. */
            XMEMCPY(&t[2], (void*)(((size_t)&t[0] & addr_mask[y^1]) +
                                   ((size_t)&t[1] & addr_mask[y])),
                    sizeof(sp_point_256));
            sp_256_proj_point_dbl_9(&t[2], &t[2], tmp);
            XMEMCPY((void*)(((size_t)&t[0] & addr_mask[y^1]) +
                            ((size_t)&t[1] & addr_mask[y])), &t[2],
                    sizeof(sp_point_256));
        }

        if (map != 0) {
            sp_256_map_9(r, &t[0], tmp);
        }
        else {
            XMEMCPY(r, &t[0], sizeof(sp_point_256));
        }
    }

    /* Working data depends on the scalar - wipe it before release. */
#ifdef WOLFSSL_SP_SMALL_STACK
    if (tmp != NULL)
#endif
    {
        ForceZero(tmp, sizeof(sp_digit) * 2 * 9 * 6);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
    #endif
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
#endif
    {
        ForceZero(t, sizeof(sp_point_256) * 3);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Progress record and working data for the non-blocking scalar multiply.
 * Overlaid on the data area of a sp_ecc_ctx_t by the non-blocking routine.
 */
typedef struct sp_256_ecc_mulmod_9_ctx {
    int state;          /* Next step to run: 0 = setup, 7 = map/finish. */
    /* Context of the nested operation; only one is active at a time. */
    union {
        sp_256_proj_point_dbl_9_ctx dbl_ctx;
        sp_256_proj_point_add_9_ctx add_ctx;
    };
    sp_point_256 t[3];          /* Two working points and a copy for doubling. */
    sp_digit tmp[2 * 9 * 6];    /* Temporary ordinate data for add/double/map. */
    sp_digit n;         /* Scalar digit being consumed; next bit is bit 28. */
    int i;              /* Index of the next scalar digit to load. */
    int c;              /* Number of bits left in n. */
    int y;              /* Current scalar bit. */
} sp_256_ecc_mulmod_9_ctx;

/* Multiply the point by the scalar and return the result - non-blocking.
 * If map is true then convert result to affine coordinates.
 *
 * Same add and double algorithm as the blocking small implementation, split
 * into steps. Progress and all working data live in sp_ctx->data, so the
 * function must be called repeatedly with the same arguments until it stops
 * returning FP_WOULDBLOCK. State 0 performs the setup; the state is therefore
 * expected to be zero on the first call.
 *
 * sp_ctx  Context whose data area is used as a sp_256_ecc_mulmod_9_ctx.
 * r       Resulting point.
 * g       Point to multiply.
 * k       Scalar to multiply by.
 * map     Indicates whether to convert result to affine.
 * ct      Constant time required. Not used - always constant time.
 * heap    Heap to use for allocation. Not used.
 * returns FP_WOULDBLOCK while more calls are required, MP_OKAY when the
 *         result is available and the error from converting the point to
 *         Montgomery form (e.g. MEMORY_E) on failure.
 */
static int sp_256_ecc_mulmod_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
    const sp_point_256* g, const sp_digit* k, int map, int ct, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_256_ecc_mulmod_9_ctx* ctx = (sp_256_ecc_mulmod_9_ctx*)sp_ctx->data;

    /* Compile time check: the array size is negative, and compilation fails,
     * when the multiply context is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_256_ecc_mulmod_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    /* Implementation is constant time. */
    (void)ct;

    switch (ctx->state) {
    case 0: /* INIT */
        XMEMSET(ctx->t, 0, sizeof(sp_point_256) * 3);
        /* Top digit holds 24 of the 256 bits; align them to bit 28. */
        ctx->i = 8;
        ctx->c = 24;
        ctx->n = k[ctx->i--] << (29 - ctx->c);

        /* t[0] starts as the point at infinity: zeroed with the flag set. */
        ctx->t[0].infinity = 1;
        ctx->state = 1;
        break;
    case 1: /* T1X */
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_256_mod_mul_norm_9(ctx->t[1].x, g->x, p256_mod);
        ctx->state = 2;
        break;
    case 2: /* T1Y */
        err = sp_256_mod_mul_norm_9(ctx->t[1].y, g->y, p256_mod);
        ctx->state = 3;
        break;
    case 3: /* T1Z */
        err = sp_256_mod_mul_norm_9(ctx->t[1].z, g->z, p256_mod);
        ctx->state = 4;
        break;
    case 4: /* ADDPREP */
        if (ctx->c == 0) {
            /* Digit used up - load the next one or move on to mapping. */
            if (ctx->i == -1) {
                ctx->state = 7;
                break;
            }

            ctx->n = k[ctx->i--];
            ctx->c = 29;
        }
        /* Next scalar bit is always taken from bit 28. */
        ctx->y = (ctx->n >> 28) & 1;
        /* Keep n within 29 bits. Bits shifted above bit 28 are never read
         * and, if sp_digit is a signed 32-bit type (assumed - TODO confirm),
         * would otherwise eventually overflow it. */
        ctx->n = (ctx->n << 1) & 0x1fffffff;
        XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx));
        ctx->state = 5;
        break;
    case 5: /* ADD */
        /* t[y^1] = t[0] + t[1]; stays in this state until the add is done. */
        err = sp_256_proj_point_add_9_nb((sp_ecc_ctx_t*)&ctx->add_ctx,
            &ctx->t[ctx->y^1], &ctx->t[0], &ctx->t[1], ctx->tmp);
        if (err == MP_OKAY) {
            /* Copy t[y] to t[2] using an address computed with masks rather
             * than by indexing with the scalar bit. */
            XMEMCPY(&ctx->t[2], (void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) +
                                        ((size_t)&ctx->t[1] & addr_mask[ctx->y])),
                    sizeof(sp_point_256));
            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
            ctx->state = 6;
        }
        break;
    case 6: /* DBL */
        /* t[2] = 2 * t[2]; stays in this state until the double is done. */
        err = sp_256_proj_point_dbl_9_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, &ctx->t[2],
            &ctx->t[2], ctx->tmp);
        if (err == MP_OKAY) {
            /* Store the doubled point back into t[y]. */
            XMEMCPY((void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) +
                            ((size_t)&ctx->t[1] & addr_mask[ctx->y])), &ctx->t[2],
                    sizeof(sp_point_256));
            ctx->state = 4;
            ctx->c--;
        }
        break;
    case 7: /* MAP */
        if (map != 0) {
            sp_256_map_9(r, &ctx->t[0], ctx->tmp);
        }
        else {
            XMEMCPY(r, &ctx->t[0], sizeof(sp_point_256));
        }
        err = MP_OKAY;
        break;
    }

    /* Only the final state may report success. */
    if (err == MP_OKAY && ctx->state != 7) {
        err = FP_WOULDBLOCK;
    }
    /* Finished or failed: wipe the scalar dependent working data. */
    if (err != FP_WOULDBLOCK) {
        ForceZero(ctx->tmp, sizeof(ctx->tmp));
        ForceZero(ctx->t, sizeof(ctx->t));
    }

    (void)heap;

    return err;
}

#endif /* WOLFSSL_SP_NONBLOCK */

#else
/* A table entry for pre-computed points.
 * Only x and y ordinates are stored, 9 digits each; there is no z ordinate
 * and no infinity flag.
 */
typedef struct sp_table_entry_256 {
    sp_digit x[9];  /* x-ordinate. */
    sp_digit y[9];  /* y-ordinate. */
} sp_table_entry_256;

/* Copy a over r when the mask is set; leave r unchanged otherwise.
 * The same operations are executed for either mask value.
 *
 * r  A single precision number to copy over.
 * a  A single precision number to copy.
 * m  Mask value to apply: -1 to copy and 0 to keep r.
 */
static void sp_256_cond_copy_9(sp_digit* r, const sp_digit* a, const sp_digit m)
{
    /* Bitwise difference between r and a; XORing it into r turns r into a. */
    sp_digit delta[9];
#ifdef WOLFSSL_SP_SMALL
    int j;

    for (j = 0; j < 9; j++) {
        delta[j] = r[j] ^ a[j];
    }
    for (j = 0; j < 9; j++) {
        r[j] = r[j] ^ (delta[j] & m);
    }
#else
    delta[0] = r[0] ^ a[0];
    delta[1] = r[1] ^ a[1];
    delta[2] = r[2] ^ a[2];
    delta[3] = r[3] ^ a[3];
    delta[4] = r[4] ^ a[4];
    delta[5] = r[5] ^ a[5];
    delta[6] = r[6] ^ a[6];
    delta[7] = r[7] ^ a[7];
    delta[8] = r[8] ^ a[8];

    r[0] = r[0] ^ (delta[0] & m);
    r[1] = r[1] ^ (delta[1] & m);
    r[2] = r[2] ^ (delta[2] & m);
    r[3] = r[3] ^ (delta[3] & m);
    r[4] = r[4] ^ (delta[4] & m);
    r[5] = r[5] ^ (delta[5] & m);
    r[6] = r[6] ^ (delta[6] & m);
    r[7] = r[7] ^ (delta[7] & m);
    r[8] = r[8] ^ (delta[8] & m);
#endif /* WOLFSSL_SP_SMALL */
}

/* Double the Montgomery form projective point p a number of times, in place.
 *
 * Y is kept doubled and W = Z^4 is carried from one doubling to the next
 * while iterating; Y is halved once at the end.
 *
 * When WOLFSSL_SP_SMALL is not defined the loop runs i-1 times and the final
 * doubling is written out after it (without the update of W), so a value of i
 * below 1 still results in one doubling. When WOLFSSL_SP_SMALL is defined the
 * loop runs i times.
 *
 * p  Point to double. Overwritten with the result.
 * i  Number of times to double.
 * t  Temporary ordinate data.
 */
static void sp_256_proj_point_dbl_n_9(sp_point_256* p, int i,
    sp_digit* t)
{
    /* Five scratch values taken from the temporary data. */
    sp_digit* w = t;
    sp_digit* a = t + 2*9;
    sp_digit* b = t + 4*9;
    sp_digit* t1 = t + 6*9;
    sp_digit* t2 = t + 8*9;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    /* Loop counter is a volatile copy of the requested count. */
    volatile int n = i;

    x = p->x;
    y = p->y;
    z = p->z;

    /* Y = 2*Y */
    sp_256_mont_dbl_9(y, y, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_9(w, z, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_9(w, w, p256_mod, p256_mp_mod);
#ifndef WOLFSSL_SP_SMALL
    while (--n > 0)
#else
    while (--n >= 0)
#endif
    {
        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(t1, t1, w, p256_mod);
        sp_256_mont_tpl_9(a, t1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod);
        /* X = A^2 - 2B */
        sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_9(t2, b, p256_mod);
        sp_256_mont_sub_9(x, x, t2, p256_mod);
        /* B = 2.(B - X) */
        sp_256_mont_sub_9(t2, b, x, p256_mod);
        sp_256_mont_dbl_9(b, t2, p256_mod);
        /* Z = Z*Y */
        sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod);
        /* t1 = Y^4 */
        sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod);
#ifdef WOLFSSL_SP_SMALL
        /* W is not needed after the last doubling. */
        if (n != 0)
#endif
        {
            /* W = W*Y^4 */
            sp_256_mont_mul_9(w, w, t1, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(y, y, t1, p256_mod);
    }
#ifndef WOLFSSL_SP_SMALL
    /* Final doubling: same steps as the loop body minus the update of W. */
    /* A = 3*(X^2 - W) */
    sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(t1, t1, w, p256_mod);
    sp_256_mont_tpl_9(a, t1, p256_mod);
    /* B = X*Y^2 */
    sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod);
    /* X = A^2 - 2B */
    sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
    sp_256_mont_dbl_9(t2, b, p256_mod);
    sp_256_mont_sub_9(x, x, t2, p256_mod);
    /* B = 2.(B - X) */
    sp_256_mont_sub_9(t2, b, x, p256_mod);
    sp_256_mont_dbl_9(b, t2, p256_mod);
    /* Z = Z*Y */
    sp_256_mont_mul_9(z, z, y, p256_mod, p256_mp_mod);
    /* t1 = Y^4 */
    sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod);
    /* y = 2*A*(B - X) - Y^4 */
    sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(y, y, t1, p256_mod);
#endif /* WOLFSSL_SP_SMALL */
    /* Y = Y/2 */
    sp_256_mont_div2_9(y, y, p256_mod);
}

/* Double the Montgomery form projective point p a number of times and store
 * every intermediate result.
 *
 * The result of the i-th doubling (i = 1..n) is written to r[m * 2^i] and its
 * infinity flag is cleared. r[2*m].x and r[2*m].z are first loaded with the
 * ordinates of p; r[(1<<n)*m].y is used as the working (doubled) y-ordinate
 * for the whole run.
 *
 * r  Array of points to store the doublings into.
 * p  Point to double.
 * n  Number of times to double.
 * m  Index multiplier: doubling i is stored at index m * 2^i.
 * t  Temporary ordinate data.
 */
static void sp_256_proj_point_dbl_n_store_9(sp_point_256* r,
        const sp_point_256* p, int n, int m, sp_digit* t)
{
    /* Five scratch values taken from the temporary data. */
    sp_digit* w = t;
    sp_digit* a = t + 2*9;
    sp_digit* b = t + 4*9;
    sp_digit* t1 = t + 6*9;
    sp_digit* t2 = t + 8*9;
    sp_digit* x = r[2*m].x;
    /* Working y lives in the last entry written. */
    sp_digit* y = r[(1<<n)*m].y;
    sp_digit* z = r[2*m].z;
    int i;
    int j;

    /* Load the ordinates of p (9 digits each) into the working locations. */
    for (i=0; i<9; i++) {
        x[i] = p->x[i];
    }
    for (i=0; i<9; i++) {
        y[i] = p->y[i];
    }
    for (i=0; i<9; i++) {
        z[i] = p->z[i];
    }

    /* Y = 2*Y */
    sp_256_mont_dbl_9(y, y, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_9(w, z, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_9(w, w, p256_mod, p256_mp_mod);
    j = m;
    for (i=1; i<=n; i++) {
        /* Index of the entry receiving this doubling: m * 2^i. */
        j *= 2;

        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_9(t1, x, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(t1, t1, w, p256_mod);
        sp_256_mont_tpl_9(a, t1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_9(t1, y, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(b, t1, x, p256_mod, p256_mp_mod);
        /* The previous x is no longer needed - switch to the output entry. */
        x = r[j].x;
        /* X = A^2 - 2B */
        sp_256_mont_sqr_9(x, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_9(t2, b, p256_mod);
        sp_256_mont_sub_9(x, x, t2, p256_mod);
        /* B = 2.(B - X) */
        sp_256_mont_sub_9(t2, b, x, p256_mod);
        sp_256_mont_dbl_9(b, t2, p256_mod);
        /* Z = Z*Y */
        sp_256_mont_mul_9(r[j].z, z, y, p256_mod, p256_mp_mod);
        z = r[j].z;
        /* t1 = Y^4 */
        sp_256_mont_sqr_9(t1, t1, p256_mod, p256_mp_mod);
        /* W is not needed after the last doubling. */
        if (i != n) {
            /* W = W*Y^4 */
            sp_256_mont_mul_9(w, w, t1, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_mul_9(y, b, a, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(y, y, t1, p256_mod);
        /* Y = Y/2 */
        /* The halved value goes to the output entry while the working y stays
         * doubled; on the last iteration both are the same location. */
        sp_256_mont_div2_9(r[j].y, y, p256_mod);
        r[j].infinity = 0;
    }
}

/* Add and subtract two Montgomery form projective points in one pass.
 *
 * Computes ra = p + q and rs = p - q, sharing the intermediate values common
 * to both. Both results get their infinity flag cleared. No check is made for
 * the inputs being equal or for either being the point at infinity.
 *
 * ra  Result of addition.
 * rs  Result of subtraction.
 * p   First point to add.
 * q   Second point to add.
 * t   Temporary ordinate data.
 */
static void sp_256_proj_point_add_sub_9(sp_point_256* ra,
        sp_point_256* rs, const sp_point_256* p, const sp_point_256* q,
        sp_digit* t)
{
    /* Six scratch values taken from the temporary data. */
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*9;
    sp_digit* t3 = t + 4*9;
    sp_digit* t4 = t + 6*9;
    sp_digit* t5 = t + 8*9;
    sp_digit* t6 = t + 10*9;
    sp_digit* xa = ra->x;
    sp_digit* ya = ra->y;
    sp_digit* za = ra->z;
    sp_digit* xs = rs->x;
    sp_digit* ys = rs->y;
    sp_digit* zs = rs->z;


    /* Start from a copy of p in ra. Half of the bytes of each ordinate array
     * are copied - presumably the arrays hold twice the 9 digits of a reduced
     * value; confirm against the definition of sp_point_256. */
    XMEMCPY(xa, p->x, sizeof(p->x) / 2);
    XMEMCPY(ya, p->y, sizeof(p->y) / 2);
    XMEMCPY(za, p->z, sizeof(p->z) / 2);
    ra->infinity = 0;
    rs->infinity = 0;

    /* U1 = X1*Z2^2 */
    sp_256_mont_sqr_9(t1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t3, t1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t1, t1, xa, p256_mod, p256_mp_mod);
    /* U2 = X2*Z1^2 */
    sp_256_mont_sqr_9(t2, za, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t4, t2, za, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
    /* S1 = Y1*Z2^3 */
    sp_256_mont_mul_9(t3, t3, ya, p256_mod, p256_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);
    /* H = U2 - U1 */
    sp_256_mont_sub_9(t2, t2, t1, p256_mod);
    /* RS = S2 + S1 */
    sp_256_mont_add_9(t6, t4, t3, p256_mod);
    /* R = S2 - S1 */
    sp_256_mont_sub_9(t4, t4, t3, p256_mod);
    /* Z3 = H*Z1*Z2 */
    /* ZS = H*Z1*Z2 */
    sp_256_mont_mul_9(za, za, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(za, za, t2, p256_mod, p256_mp_mod);
    /* Sum and difference share the same z-ordinate. */
    XMEMCPY(zs, za, sizeof(p->z)/2);
    /* X3 = R^2 - H^3 - 2*U1*H^2 */
    /* XS = RS^2 - H^3 - 2*U1*H^2 */
    sp_256_mont_sqr_9(xa, t4, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_9(xs, t6, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_9(t5, t2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(ya, t1, t5, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t5, t5, t2, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(xa, xa, t5, p256_mod);
    sp_256_mont_sub_9(xs, xs, t5, p256_mod);
    sp_256_mont_dbl_9(t1, ya, p256_mod);
    sp_256_mont_sub_9(xa, xa, t1, p256_mod);
    sp_256_mont_sub_9(xs, xs, t1, p256_mod);
    /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
    /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */
    sp_256_mont_sub_9(ys, ya, xs, p256_mod);
    sp_256_mont_sub_9(ya, ya, xa, p256_mod);
    sp_256_mont_mul_9(ya, ya, t4, p256_mod, p256_mp_mod);
    /* Presumably t6 = p256_mod - RS, i.e. -RS mod p, using the plain (non
     * Montgomery) subtraction - confirm against sp_256_sub_9. */
    sp_256_sub_9(t6, p256_mod, t6);
    sp_256_mont_mul_9(ys, ys, t6, p256_mod, p256_mp_mod);
    /* t5 = S1*H^3, subtracted from both y-ordinates. */
    sp_256_mont_mul_9(t5, t5, t3, p256_mod, p256_mp_mod);
    sp_256_mont_sub_9(ya, ya, t5, p256_mod);
    sp_256_mont_sub_9(ys, ys, t5, p256_mod);
}

/* Structure used to describe recoding of scalar multiplication.
 * One entry is produced for each window of the scalar.
 */
typedef struct ecc_recode_256 {
    /* Index into pre-computation table. */
    uint8_t i;
    /* Use the negative of the point. */
    uint8_t neg;
} ecc_recode_256;

/* The index into pre-computation table to use.
 * Looked up with a 6-bit window value plus carry (0..65): values up to 32 map
 * to themselves, 33..63 map to 64 minus the value and 64, 65 map to 0, 1.
 */
static const uint8_t recode_index_9_6[66] = {
    /*  0.. 7 */  0,  1,  2,  3,  4,  5,  6,  7,
    /*  8..15 */  8,  9, 10, 11, 12, 13, 14, 15,
    /* 16..23 */ 16, 17, 18, 19, 20, 21, 22, 23,
    /* 24..31 */ 24, 25, 26, 27, 28, 29, 30, 31,
    /* 32..39 */ 32, 31, 30, 29, 28, 27, 26, 25,
    /* 40..47 */ 24, 23, 22, 21, 20, 19, 18, 17,
    /* 48..55 */ 16, 15, 14, 13, 12, 11, 10,  9,
    /* 56..63 */  8,  7,  6,  5,  4,  3,  2,  1,
    /* 64..65 */  0,  1,
};

/* Whether to negate y-ordinate.
 *
 * Same indexing as recode_index_9_6: window values 32..63 are represented as
 * a negated table entry, everything else is used as is.
 */
static const uint8_t recode_neg_9_6[66] = {
    /*  0..31: add the table entry */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 32..63: subtract the table entry */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 64..65: wrapped round, add again */
    0, 0,
};

/* Recode the scalar for multiplication using pre-computed values and
 * subtraction.
 *
 * The scalar is consumed in 43 windows of 6 bits, least significant window
 * first, from digits that each supply 29 bits. Each window value, plus the
 * carry out of the previous window, is translated through recode_index_9_6
 * and recode_neg_9_6 into a table index and a negate flag.
 *
 * k  Scalar to multiply by. Digits k[0]..k[8] are read.
 * v  Vector of operations to perform. Entries v[0]..v[42] are written.
 */
static void sp_256_ecc_recode_6_9(const sp_digit* k, ecc_recode_256* v)
{
    int i;
    /* Index of the digit of k currently being consumed. */
    int j;
    /* Current window value. */
    uint8_t y;
    int carry = 0;
    /* Number of bits of the current digit already consumed. */
    int o;
    /* Bits of the current digit that have not been consumed yet. */
    sp_digit n;

    j = 0;
    n = k[j];
    o = 0;
    for (i=0; i<43; i++) {
        /* Low 8 bits of what is left of the current digit. */
        y = (int8_t)n;
        if (o + 6 < 29) {
            /* Window lies completely inside the current digit. */
            y &= 0x3f;
            n >>= 6;
            o += 6;
        }
        else if (o + 6 == 29) {
            /* Window ends exactly on the digit boundary: start the next digit.
             * NOTE(review): y is not masked here - presumably only 6 bits of
             * the digit remain in n at this point; confirm digits are < 2^29.
             */
            n >>= 6;
            if (++j < 9)
                n = k[j];
            o = 0;
        }
        else if (++j < 9) {
            /* Window straddles two digits: keep the bits left over from the
             * old digit and fill the top of the window from the new one.
             */
            n = k[j];
            y |= (uint8_t)((n << (29 - o)) & 0x3f);
            /* o + 6 - 29: bits of the new digit used by this window. */
            o -= 23;
            n >>= o;
        }

        y += (uint8_t)carry;
        v[i].i = recode_index_9_6[y];
        v[i].neg = recode_neg_9_6[y];
        /* Carry into the next window when the value overflowed 6 bits or
         * when the negated representation was chosen.
         */
        carry = (y >> 6) + v[i].neg;
    }
}

#ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible point that could be being copied.
 *
 * Entries 1..32 of the table are all read, whatever idx is; a mask keeps only
 * the ordinates of the entry whose index matches. An idx of 0 (or any value
 * outside 1..32) leaves the first 9 digits of x, y and z as zero.
 * The infinity field of r is not written.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of entry to retrieve.
 */
static void sp_256_get_point_33_9(sp_point_256* r, const sp_point_256* table,
    int idx)
{
    int entry;
    int w;
    sp_digit sel;

    /* Clear the 9 digits of each ordinate before OR-ing the selection in. */
    for (w = 0; w < 9; w++) {
        r->x[w] = 0;
        r->y[w] = 0;
        r->z[w] = 0;
    }

    for (entry = 1; entry < 33; entry++) {
        /* All bits set for the wanted entry, no bits set for any other. */
        sel = 0 - (entry == idx);
        for (w = 0; w < 9; w++) {
            r->x[w] |= sel & table[entry].x[w];
            r->y[w] |= sel & table[entry].y[w];
            r->z[w] |= sel & table[entry].z[w];
        }
    }
}
#endif /* !WC_NO_CACHE_RESISTANT */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Window technique of 6 bits. (Add-Sub variation.)
 * Calculate 0..32 times the point. Use function that adds and
 * subtracts the same two points.
 * Recode to add or subtract one of the computed points.
 * Double to push up.
 * NOT a sliding window.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required. When non-zero (and WC_NO_CACHE_RESISTANT is
 *       not defined) table entries are fetched with sp_256_get_point_33_9()
 *       rather than by indexing the table directly.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, an error returned by
 * sp_256_mod_mul_norm_9() and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_win_add_sub_9(sp_point_256* r, const sp_point_256* g,
        const sp_digit* k, int map, int ct, void* heap)
{
    /* t holds the 33 table entries followed by two working points (rt, p).
     * tmp is scratch space handed to the point operations.
     */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* t = NULL;
    sp_digit* tmp = NULL;
#else
    sp_point_256 t[33+2];
    sp_digit tmp[2 * 9 * 6];
#endif
    /* Accumulator: t[33]. */
    sp_point_256* rt = NULL;
    /* Table entry selected for the current window: t[34]. */
    sp_point_256* p = NULL;
    /* Scratch for the negated y-ordinate of p. */
    sp_digit* negy;
    int i;
    /* Recoded scalar: one entry per 6-bit window. */
    ecc_recode_256 v[43];
    int err = MP_OKAY;

    /* Constant time used for cache attack resistance implementation. */
    (void)ct;
    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (sp_point_256*)XMALLOC(sizeof(sp_point_256) *
        (33+2), heap, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6,
                                 heap, DYNAMIC_TYPE_ECC);
        if (tmp == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        rt = t + 33;
        p  = t + 33+1;

        /* t[0] = {0, 0, 1} * norm */
        XMEMSET(&t[0], 0, sizeof(t[0]));
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_256_mod_mul_norm_9(t[1].x, g->x, p256_mod);
    }
    if (err == MP_OKAY) {
        err = sp_256_mod_mul_norm_9(t[1].y, g->y, p256_mod);
    }
    if (err == MP_OKAY) {
        err = sp_256_mod_mul_norm_9(t[1].z, g->z, p256_mod);
    }

    if (err == MP_OKAY) {
        t[1].infinity = 0;
        /* t[2] ... t[32]
         * NOTE(review): the helpers are defined outside this view. Presumably
         * dbl_n_store fills the power-of-two entries t[2], t[4], ..., t[32]
         * and each add_sub call writes both (a + t[1]) and (a - t[1]); the
         * entries read below are consistent with that - confirm against the
         * helper definitions.
         */
        sp_256_proj_point_dbl_n_store_9(t, &t[ 1], 5, 1, tmp);
        sp_256_proj_point_add_9(&t[ 3], &t[ 2], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[ 6], &t[ 3], tmp);
        sp_256_proj_point_add_sub_9(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[10], &t[ 5], tmp);
        sp_256_proj_point_add_sub_9(&t[11], &t[ 9], &t[10], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[12], &t[ 6], tmp);
        sp_256_proj_point_dbl_9(&t[14], &t[ 7], tmp);
        sp_256_proj_point_add_sub_9(&t[15], &t[13], &t[14], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[18], &t[ 9], tmp);
        sp_256_proj_point_add_sub_9(&t[19], &t[17], &t[18], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[20], &t[10], tmp);
        sp_256_proj_point_dbl_9(&t[22], &t[11], tmp);
        sp_256_proj_point_add_sub_9(&t[23], &t[21], &t[22], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[24], &t[12], tmp);
        sp_256_proj_point_dbl_9(&t[26], &t[13], tmp);
        sp_256_proj_point_add_sub_9(&t[27], &t[25], &t[26], &t[ 1], tmp);
        sp_256_proj_point_dbl_9(&t[28], &t[14], tmp);
        sp_256_proj_point_dbl_9(&t[30], &t[15], tmp);
        sp_256_proj_point_add_sub_9(&t[31], &t[29], &t[30], &t[ 1], tmp);

        /* The y storage of t[0] doubles as the scratch for negated y. */
        negy = t[0].y;

        sp_256_ecc_recode_6_9(k, v);

        /* Start the accumulator from the most significant window. */
        i = 42;
    #ifndef WC_NO_CACHE_RESISTANT
        if (ct) {
            sp_256_get_point_33_9(rt, t, v[i].i);
            /* get_point does not write the infinity flag: index 0 means the
             * point at infinity.
             */
            rt->infinity = !v[i].i;
        }
        else
    #endif
        {
            XMEMCPY(rt, &t[v[i].i], sizeof(sp_point_256));
        }
        /* Remaining windows, most significant first. */
        for (--i; i>=0; i--) {
            /* Make room for the next 6-bit window. */
            sp_256_proj_point_dbl_n_9(rt, 6, tmp);

        #ifndef WC_NO_CACHE_RESISTANT
            if (ct) {
                sp_256_get_point_33_9(p, t, v[i].i);
                p->infinity = !v[i].i;
            }
            else
        #endif
            {
                XMEMCPY(p, &t[v[i].i], sizeof(sp_point_256));
            }
            /* Always compute p256_mod - y, then keep it only when the window
             * asks for the negated point (mask is all ones when neg is 1).
             */
            sp_256_sub_9(negy, p256_mod, p->y);
            sp_256_norm_9(negy);
            sp_256_cond_copy_9(p->y, negy, (sp_digit)0 - v[i].neg);
            sp_256_proj_point_add_9(rt, rt, p, tmp);
        }

        if (map != 0) {
            sp_256_map_9(r, rt, tmp);
        }
        else {
            XMEMCPY(r, rt, sizeof(sp_point_256));
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    if (tmp != NULL)
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef FP_ECC
#endif /* FP_ECC */
/* Add two Montgomery form projective points. The second point has a z
 * ordinate of one (q->z is never read when computing the sum).
 * Only the first point can be the same pointer as the result point.
 *
 * When neither point is at infinity and both represent the same point, the
 * addition is replaced by a doubling of p.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data. Offsets up to 12*9 digits are used here.
 */
static void sp_256_proj_point_add_qz1_9(sp_point_256* r,
    const sp_point_256* p, const sp_point_256* q, sp_digit* t)
{
    /* Six temporaries of 2*9 digits each carved out of t. */
    sp_digit* t2 = t;
    sp_digit* t3 = t + 2*9;
    sp_digit* t6 = t + 4*9;
    sp_digit* t1 = t + 6*9;
    sp_digit* t4 = t + 8*9;
    sp_digit* t5 = t + 10*9;

    /* Calculate values to subtract from P->x and P->y. */
    /* U2 = X2*Z1^2 */
    sp_256_mont_sqr_9(t2, p->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t4, t2, p->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(t2, t2, q->x, p256_mod, p256_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_256_mont_mul_9(t4, t4, q->y, p256_mod, p256_mp_mod);

    /* Same point (U2 == X1 and S2 == Y1, neither at infinity): double. */
    if ((~p->infinity) & (~q->infinity) &
            sp_256_cmp_equal_9(p->x, t2) &
            sp_256_cmp_equal_9(p->y, t4)) {
        sp_256_proj_point_dbl_9(r, p, t);
    }
    else {
        /* The computed sum is built in the temporaries so that p may alias r;
         * it is only copied to r in the masked selection at the end.
         */
        sp_digit* x = t2;
        sp_digit* y = t3;
        sp_digit* z = t6;

        /* H = U2 - X1 */
        sp_256_mont_sub_9(t2, t2, p->x, p256_mod);
        /* R = S2 - Y1 */
        sp_256_mont_sub_9(t4, t4, p->y, p256_mod);
        /* Z3 = H*Z1 */
        sp_256_mont_mul_9(z, p->z, t2, p256_mod, p256_mp_mod);
        /* X3 = R^2 - H^3 - 2*X1*H^2 */
        sp_256_mont_sqr_9(t1, t2, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(t3, p->x, t1, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(t1, t1, t2, p256_mod, p256_mp_mod);
        sp_256_mont_sqr_9(t2, t4, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(t2, t2, t1, p256_mod);
        sp_256_mont_dbl_9(t5, t3, p256_mod);
        sp_256_mont_sub_9(x, t2, t5, p256_mod);
        /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
        sp_256_mont_sub_9(t3, t3, x, p256_mod);
        sp_256_mont_mul_9(t3, t3, t4, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(t1, t1, p->y, p256_mod, p256_mp_mod);
        sp_256_mont_sub_9(y, t3, t1, p256_mod);
        {
            int i;
            /* Only q is at infinity: the result is p. */
            sp_digit maskp = 0 - (q->infinity & (!p->infinity));
            /* Only p is at infinity: the result is q. */
            sp_digit maskq = 0 - (p->infinity & (!q->infinity));
            /* Otherwise: the result is the computed sum. */
            sp_digit maskt = ~(maskp | maskq);
            /* Both at infinity: the result is flagged as infinity. */
            sp_digit inf = (sp_digit)(p->infinity & q->infinity);

            /* Select without branching on the infinity flags. */
            for (i = 0; i < 9; i++) {
                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
                          (x[i] & maskt);
            }
            for (i = 0; i < 9; i++) {
                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
                          (y[i] & maskt);
            }
            for (i = 0; i < 9; i++) {
                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
                          (z[i] & maskt);
            }
            r->z[0] |= inf;
            r->infinity = (word32)inf;
        }
    }
}

#ifdef FP_ECC
/* Convert the projective point to affine, in place.
 * Ordinates are in Montgomery form.
 *
 * x is multiplied by the square and y by the cube of the value that
 * sp_256_mont_inv_9() produces from z; z is then set to p256_norm_mod.
 *
 * a  Point to convert.
 * t  Temporary data.
 */
static void sp_256_proj_to_affine_9(sp_point_256* a, sp_digit* t)
{
    sp_digit* zInv    = t;            /* inverse of z, later its cube */
    sp_digit* zInvSqr = t + 2 * 9;    /* square of the inverse of z */
    sp_digit* scratch = t + 4 * 9;    /* working space for the inversion */

    sp_256_mont_inv_9(zInv, a->z, scratch);
    sp_256_mont_sqr_9(zInvSqr, zInv, p256_mod, p256_mp_mod);

    /* x ordinate only needs the square. */
    sp_256_mont_mul_9(a->x, a->x, zInvSqr, p256_mod, p256_mp_mod);

    /* y ordinate needs the cube: square times the inverse. */
    sp_256_mont_mul_9(zInv, zInvSqr, zInv, p256_mod, p256_mp_mod);
    sp_256_mont_mul_9(a->y, a->y, zInv, p256_mod, p256_mp_mod);

    XMEMCPY(a->z, p256_norm_mod, sizeof(p256_norm_mod));
}

/* Generate the pre-computed table of points for the base point.
 *
 * width = 8
 * 256 entries
 * 32 bits between
 *
 * Entry 0 is all zero. Entry (1 << i) is the point doubled 32*i times.
 * Every other entry is the sum of the entry for its top set bit and the
 * entry for the remaining bits. All entries are stored as affine x and y.
 *
 * a      The base point.
 * table  Place to store generated point data.
 * tmp    Temporary data.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, an error returned by
 * sp_256_mod_mul_norm_9() and MP_OKAY on success.
 */
static int sp_256_gen_stripe_table_9(const sp_point_256* a,
        sp_table_entry_256* table, sp_digit* tmp, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* pts = NULL;
#else
    sp_point_256 pts[3];
#endif
    sp_point_256* cur = NULL;   /* running point / result of each addition */
    sp_point_256* hi = NULL;    /* entry for the top set bit */
    sp_point_256* lo = NULL;    /* entry for the remaining low bits */
    int bit;
    int base;
    int off;
    int err = MP_OKAY;

    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    pts = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 3, heap,
                                 DYNAMIC_TYPE_ECC);
    if (pts == NULL) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        cur = pts;
        hi  = pts + 1;
        lo  = pts + 2;

        /* Convert each ordinate of the base point, stopping at first error. */
        err = sp_256_mod_mul_norm_9(cur->x, a->x, p256_mod);
        if (err == MP_OKAY) {
            err = sp_256_mod_mul_norm_9(cur->y, a->y, p256_mod);
        }
        if (err == MP_OKAY) {
            err = sp_256_mod_mul_norm_9(cur->z, a->z, p256_mod);
        }
    }

    if (err == MP_OKAY) {
        cur->infinity = 0;
        sp_256_proj_to_affine_9(cur, tmp);

        /* Both operands of the additions are affine: z is always one. */
        XMEMCPY(hi->z, p256_norm_mod, sizeof(p256_norm_mod));
        hi->infinity = 0;
        XMEMCPY(lo->z, p256_norm_mod, sizeof(p256_norm_mod));
        lo->infinity = 0;

        /* table[0] = {0, 0, infinity} */
        XMEMSET(&table[0], 0, sizeof(sp_table_entry_256));
        /* table[1] = Affine version of 'a' in Montgomery form */
        XMEMCPY(table[1].x, cur->x, sizeof(table->x));
        XMEMCPY(table[1].y, cur->y, sizeof(table->y));

        /* Single bit entries: each is the previous one doubled 32 times. */
        for (bit = 1; bit < 8; bit++) {
            base = 1 << bit;
            sp_256_proj_point_dbl_n_9(cur, 32, tmp);
            sp_256_proj_to_affine_9(cur, tmp);
            XMEMCPY(table[base].x, cur->x, sizeof(table->x));
            XMEMCPY(table[base].y, cur->y, sizeof(table->y));
        }

        /* Multiple bit entries: top bit entry plus low bits entry. */
        for (bit = 1; bit < 8; bit++) {
            base = 1 << bit;
            XMEMCPY(hi->x, table[base].x, sizeof(table->x));
            XMEMCPY(hi->y, table[base].y, sizeof(table->y));
            for (off = 1; off < base; off++) {
                XMEMCPY(lo->x, table[off].x, sizeof(table->x));
                XMEMCPY(lo->y, table[off].y, sizeof(table->y));
                sp_256_proj_point_add_qz1_9(cur, hi, lo, tmp);
                sp_256_proj_to_affine_9(cur, tmp);
                XMEMCPY(table[base + off].x, cur->x, sizeof(table->x));
                XMEMCPY(table[base + off].y, cur->y, sizeof(table->y));
            }
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (pts != NULL) {
        XFREE(pts, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#endif /* FP_ECC */
#ifndef WC_NO_CACHE_RESISTANT
/* Touch each possible entry that could be being copied.
 *
 * Entries 1..255 of the table are all read, whatever idx is; a mask keeps
 * only the ordinates of the entry whose index matches. An idx of 0 (or any
 * value outside 1..255) leaves the first 9 digits of x and y as zero.
 * The z ordinate and the infinity field of r are not written.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of entry to retrieve.
 */
static void sp_256_get_entry_256_9(sp_point_256* r,
    const sp_table_entry_256* table, int idx)
{
    int entry;
    int w;
    sp_digit sel;

    /* Clear the 9 digits of each ordinate before OR-ing the selection in. */
    for (w = 0; w < 9; w++) {
        r->x[w] = 0;
        r->y[w] = 0;
    }

    for (entry = 1; entry < 256; entry++) {
        /* All bits set for the wanted entry, no bits set for any other. */
        sel = 0 - (entry == idx);
        for (w = 0; w < 9; w++) {
            r->x[w] |= sel & table[entry].x[w];
            r->y[w] |= sel & table[entry].y[w];
        }
    }
}
#endif /* !WC_NO_CACHE_RESISTANT */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Stripe implementation.
 * Pre-generated: 2^0, 2^32, ...
 * Pre-generated: products of all combinations of above.
 * 8 doubles and adds (with qz=1)
 *
 * The table index for step i is built from scalar bits i, i+32, ..., i+224.
 * Steps run from i = 31 down to 0; the first one initialises the result and
 * each later one doubles the result and adds the selected table entry.
 *
 * r      Resulting point.
 * g      Point the table was generated from. Not used here.
 * table  Pre-computed table.
 * k      Scalar to multiply by.
 * map    Indicates whether to convert result to affine.
 * ct     Constant time required. When non-zero (and WC_NO_CACHE_RESISTANT is
 *        not defined) table entries are fetched with
 *        sp_256_get_entry_256_9() rather than by indexing the table directly.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_stripe_9(sp_point_256* r, const sp_point_256* g,
        const sp_table_entry_256* table, const sp_digit* k, int map,
        int ct, void* heap)
{
    /* rt[0] is the accumulator, rt[1] the table entry being added (p).
     * t is scratch space handed to the point operations.
     */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* rt = NULL;
    sp_digit* t = NULL;
#else
    sp_point_256 rt[2];
    sp_digit t[2 * 9 * 6];
#endif
    sp_point_256* p = NULL;
    int i;
    int j;
    /* Table index assembled from 8 scalar bits. */
    int y;
    /* Position of the scalar bit being read. */
    int x;
    int err = MP_OKAY;

    (void)g;
    /* Constant time used for cache attack resistance implementation. */
    (void)ct;
    (void)heap;


#ifdef WOLFSSL_SP_SMALL_STACK
    rt = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
                                      DYNAMIC_TYPE_ECC);
    if (rt == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap,
                               DYNAMIC_TYPE_ECC);
        if (t == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        p = rt + 1;

        /* Table entries only carry x and y: z is set to p256_norm_mod. */
        XMEMCPY(p->z, p256_norm_mod, sizeof(p256_norm_mod));
        XMEMCPY(rt->z, p256_norm_mod, sizeof(p256_norm_mod));

        /* First step: bits 31, 63, ..., 255 (29 bits per scalar digit). */
        y = 0;
        x = 31;
        for (j=0; j<8; j++) {
            y |= (int)(((k[x / 29] >> (x % 29)) & 1) << j);
            x += 32;
        }
    #ifndef WC_NO_CACHE_RESISTANT
        if (ct) {
            sp_256_get_entry_256_9(rt, table, y);
        } else
    #endif
        {
            XMEMCPY(rt->x, table[y].x, sizeof(table[y].x));
            XMEMCPY(rt->y, table[y].y, sizeof(table[y].y));
        }
        /* Index 0 stands for the point at infinity. */
        rt->infinity = !y;
        for (i=30; i>=0; i--) {
            /* Gather bits i, i+32, ..., i+224 into the table index. */
            y = 0;
            x = i;
            for (j=0; j<8; j++) {
                y |= (int)(((k[x / 29] >> (x % 29)) & 1) << j);
                x += 32;
            }

            sp_256_proj_point_dbl_9(rt, rt, t);
        #ifndef WC_NO_CACHE_RESISTANT
            if (ct) {
                sp_256_get_entry_256_9(p, table, y);
            }
            else
        #endif
            {
                XMEMCPY(p->x, table[y].x, sizeof(table[y].x));
                XMEMCPY(p->y, table[y].y, sizeof(table[y].y));
            }
            p->infinity = !y;
            sp_256_proj_point_add_qz1_9(rt, rt, p, t);
        }

        if (map != 0) {
            sp_256_map_9(r, rt, t);
        }
        else {
            XMEMCPY(r, rt, sizeof(sp_point_256));
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    if (rt != NULL)
        XFREE(rt, heap, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef FP_ECC
#ifndef FP_ENTRIES
    /* Default number of points whose pre-computed tables are cached. */
    #define FP_ENTRIES 16
#endif

/* Cache entry - holds precomputation tables for a point. */
typedef struct sp_cache_256_t {
    /* X ordinate of point that table was generated from. */
    sp_digit x[9];
    /* Y ordinate of point that table was generated from. */
    sp_digit y[9];
    /* Precomputation table for point. */
    sp_table_entry_256 table[256];
    /* Number of times the point has been looked up since it was placed in
     * this entry: set to 1 on insertion and incremented on every match.
     * sp_256_ecc_mulmod_9() generates the table when this is 2 and uses the
     * table when it is 2 or more.
     */
    uint32_t cnt;
    /* Entry is in use: x and y hold a point. The table itself is only filled
     * in once cnt has reached 2.
     */
    int set;
} sp_cache_256_t;

/* Cache of tables. */
static THREAD_LS_T sp_cache_256_t sp_cache_256[FP_ENTRIES];
/* Index of the entry returned by the most recent lookup; -1 before the first
 * lookup. The search for a free entry starts just after it.
 */
static THREAD_LS_T int sp_cache_256_last = -1;
/* Cache has been initialized (all entries marked as not set). */
static THREAD_LS_T int sp_cache_256_inited = 0;

/* Without HAVE_THREAD_LS the cache is guarded by a mutex that
 * sp_256_ecc_mulmod_9() takes around the cache lookup and table generation.
 * NOTE(review): presumably THREAD_LS_T makes the variables above per-thread
 * when HAVE_THREAD_LS is defined - confirm against its definition.
 */
#ifndef HAVE_THREAD_LS
    #ifndef WOLFSSL_MUTEX_INITIALIZER
    /* Set to 1 once wc_InitMutex() has been called on the lock. */
    static volatile int initCacheMutex_256 = 0;
    #endif
    static wolfSSL_Mutex sp_cache_256_lock WOLFSSL_MUTEX_INITIALIZER_CLAUSE(sp_cache_256_lock);
#endif

/* Get the cache entry for the point.
 *
 * When the point is already cached its use count is incremented.
 * Otherwise an entry is claimed for it: a free entry when there is one, else
 * the entry with the smallest use count is evicted. A claimed entry gets the
 * point's x and y and a use count of 1; its table is NOT generated here.
 *
 * sp_256_ecc_mulmod_9() calls this with sp_cache_256_lock held when
 * HAVE_THREAD_LS is not defined.
 *
 * g      [in]   Point scalar multiplying.
 * cache  [out]  Cache table to use.
 */
static void sp_ecc_get_cache_256(const sp_point_256* g, sp_cache_256_t** cache)
{
    int i;
    int j;
    uint32_t least;

    /* First use: mark every entry as free. */
    if (sp_cache_256_inited == 0) {
        for (i=0; i<FP_ENTRIES; i++) {
            sp_cache_256[i].set = 0;
        }
        sp_cache_256_inited = 1;
    }

    /* Compare point with those in cache. */
    for (i=0; i<FP_ENTRIES; i++) {
        if (!sp_cache_256[i].set)
            continue;

        if (sp_256_cmp_equal_9(g->x, sp_cache_256[i].x) &
                           sp_256_cmp_equal_9(g->y, sp_cache_256[i].y)) {
            sp_cache_256[i].cnt++;
            break;
        }
    }

    /* No match. */
    if (i == FP_ENTRIES) {
        /* Find empty entry, starting just after the last one used. */
        i = (sp_cache_256_last + 1) % FP_ENTRIES;
        for (; i != sp_cache_256_last; i=(i+1)%FP_ENTRIES) {
            if (!sp_cache_256[i].set) {
                break;
            }
        }

        /* Evict least used. */
        if (i == sp_cache_256_last) {
            /* Entry 0 is the initial candidate. Without resetting i here the
             * most recently used entry would be evicted whenever entry 0 has
             * the smallest count.
             */
            i = 0;
            least = sp_cache_256[0].cnt;
            for (j=1; j<FP_ENTRIES; j++) {
                if (sp_cache_256[j].cnt < least) {
                    i = j;
                    least = sp_cache_256[i].cnt;
                }
            }
        }

        XMEMCPY(sp_cache_256[i].x, g->x, sizeof(sp_cache_256[i].x));
        XMEMCPY(sp_cache_256[i].y, g->y, sizeof(sp_cache_256[i].y));
        sp_cache_256[i].set = 1;
        sp_cache_256[i].cnt = 1;
    }

    *cache = &sp_cache_256[i];
    sp_cache_256_last = i;
}
#endif /* FP_ECC */

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Without FP_ECC this is the windowed add-sub multiplication.
 * With FP_ECC the point is looked up in the cache of pre-computed tables:
 * the first use of a point takes the windowed path, the second use generates
 * the stripe table, and that and later uses take the stripe path.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, BAD_MUTEX_E when the cache
 * lock cannot be taken, an error from generating the stripe table and
 * MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_9(sp_point_256* r, const sp_point_256* g,
        const sp_digit* k, int map, int ct, void* heap)
{
#ifndef FP_ECC
    return sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* tmp = NULL;
#else
    sp_digit tmp[2 * 9 * 6];
#endif
    sp_cache_256_t* cache = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, heap, DYNAMIC_TYPE_ECC);
    if (tmp == NULL) {
        err = MEMORY_E;
    }
#endif
#ifndef HAVE_THREAD_LS
    if (err == MP_OKAY) {
        #ifndef WOLFSSL_MUTEX_INITIALIZER
        if (initCacheMutex_256 == 0) {
            wc_InitMutex(&sp_cache_256_lock);
            initCacheMutex_256 = 1;
        }
        #endif
        if (wc_LockMutex(&sp_cache_256_lock) != 0) {
            err = BAD_MUTEX_E;
        }
    }
#endif /* HAVE_THREAD_LS */

    if (err == MP_OKAY) {
        sp_ecc_get_cache_256(g, &cache);
        if (cache->cnt == 2) {
            /* Second use of the point: build its stripe table. */
            err = sp_256_gen_stripe_table_9(g, cache->table, tmp, heap);
            if (err != MP_OKAY) {
                /* The table is incomplete and must not be used. Dropping the
                 * count back to 1 makes the next use of this point attempt
                 * the generation again.
                 */
                cache->cnt = 1;
            }
        }

#ifndef HAVE_THREAD_LS
        wc_UnLockMutex(&sp_cache_256_lock);
#endif /* HAVE_THREAD_LS */
    }

    if (err == MP_OKAY) {
        /* NOTE(review): without HAVE_THREAD_LS the entry is read here after
         * the lock has been released, as before - confirm that is intended.
         */
        if (cache->cnt < 2) {
            err = sp_256_ecc_mulmod_win_add_sub_9(r, g, k, map, ct, heap);
        }
        else {
            err = sp_256_ecc_mulmod_stripe_9(r, g, cache->table, k,
                    map, ct, heap);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (tmp != NULL)
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif
    return err;
#endif
}

#endif
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * The multiplication is requested with constant time set.
 *
 * km    Scalar to multiply by.
 * gm    Point to multiply.
 * r     Resulting point.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, an error from the
 * multiplication or from storing the result, and MP_OKAY on success.
 */
int sp_ecc_mulmod_256(const mp_int* km, const ecc_point* gm, ecc_point* r,
        int map, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* pt = NULL;
    sp_digit* scalar = NULL;
#else
    sp_point_256 pt[1];
    sp_digit scalar[9];
#endif
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* The scalar is only allocated once the point has been. */
    pt = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
                                DYNAMIC_TYPE_ECC);
    if (pt != NULL) {
        scalar = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
                                    DYNAMIC_TYPE_ECC);
    }
    if ((pt == NULL) || (scalar == NULL)) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Convert the inputs to the internal representation. */
        sp_256_from_mp(scalar, 9, km);
        sp_256_point_from_ecc_point_9(pt, gm);

        /* Multiply in place and hand the result back on success. */
        err = sp_256_ecc_mulmod_9(pt, pt, scalar, map, 1, heap);
        if (err == MP_OKAY) {
            err = sp_256_point_to_ecc_point_9(pt, r);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (scalar != NULL) {
        XFREE(scalar, heap, DYNAMIC_TYPE_ECC);
    }
    if (pt != NULL) {
        XFREE(pt, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Multiply the point by the scalar, add point a and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * The multiplication is requested without mapping and without constant time;
 * mapping, when asked for, is applied after the addition.
 *
 * km      Scalar to multiply by.
 * gm      Point to multiply.
 * am      Point to add to scalar multiply result.
 * inMont  Point to add is in montgomery form.
 * r       Resulting point.
 * map     Indicates whether to convert result to affine.
 * heap    Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, an error from a conversion
 * or the multiplication, and MP_OKAY on success.
 */
int sp_ecc_mulmod_add_256(const mp_int* km, const ecc_point* gm,
    const ecc_point* am, int inMont, ecc_point* r, int map, void* heap)
{
    /* pts: product point followed by the point to add.
     * work: 9 digits of scalar followed by scratch space.
     */
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* pts = NULL;
    sp_digit* work = NULL;
#else
    sp_point_256 pts[2];
    sp_digit work[9 + 9 * 2 * 6];
#endif
    sp_point_256* sum = NULL;
    sp_point_256* addend = NULL;
    sp_digit* scalar = NULL;
    sp_digit* scratch = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* The digit buffer is only allocated once the points have been. */
    pts = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
                                 DYNAMIC_TYPE_ECC);
    if (pts != NULL) {
        work = (sp_digit*)XMALLOC(sizeof(sp_digit) * (9 + 9 * 2 * 6), heap,
                                  DYNAMIC_TYPE_ECC);
    }
    if ((pts == NULL) || (work == NULL)) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        sum     = pts;
        addend  = pts + 1;
        scalar  = work;
        scratch = work + 9;

        sp_256_from_mp(scalar, 9, km);
        sp_256_point_from_ecc_point_9(sum, gm);
        sp_256_point_from_ecc_point_9(addend, am);

        /* Convert the point to add unless the caller already did. */
        if (!inMont) {
            err = sp_256_mod_mul_norm_9(addend->x, addend->x, p256_mod);
            if (err == MP_OKAY) {
                err = sp_256_mod_mul_norm_9(addend->y, addend->y, p256_mod);
            }
            if (err == MP_OKAY) {
                err = sp_256_mod_mul_norm_9(addend->z, addend->z, p256_mod);
            }
        }
    }

    if (err == MP_OKAY) {
        err = sp_256_ecc_mulmod_9(sum, sum, scalar, 0, 0, heap);
    }

    if (err == MP_OKAY) {
        sp_256_proj_point_add_9(sum, sum, addend, scratch);
        if (map) {
            sp_256_map_9(sum, sum, scratch);
        }

        err = sp_256_point_to_ecc_point_9(sum, r);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (work != NULL) {
        XFREE(work, heap, DYNAMIC_TYPE_ECC);
    }
    if (pts != NULL) {
        XFREE(pts, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#ifdef WOLFSSL_SP_SMALL
/* Multiply the base point of P256 by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * WOLFSSL_SP_SMALL build: no table is pre-computed for the base point, so
 * this forwards to sp_256_ecc_mulmod_9() with p256_base as the point.
 *
 * r     Resulting point.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_base_9(sp_point_256* r, const sp_digit* k,
        int map, int ct, void* heap)
{
    /* No pre-computed values. */
    return sp_256_ecc_mulmod_9(r, &p256_base, k, map, ct, heap);
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Non-blocking variant of multiplying the base point of P256 by the scalar.
 *
 * Forwards to sp_256_ecc_mulmod_9_nb() with p256_base as the point and
 * returns whatever that call returns.
 *
 * sp_ctx  Context passed through to sp_256_ecc_mulmod_9_nb().
 * r       Resulting point.
 * k       Scalar to multiply by.
 * map     Indicates whether to convert result to affine.
 * ct      Constant time required.
 * heap    Heap to use for allocation.
 */
static int sp_256_ecc_mulmod_base_9_nb(sp_ecc_ctx_t* sp_ctx, sp_point_256* r,
        const sp_digit* k, int map, int ct, void* heap)
{
    /* No pre-computed values. */
    return sp_256_ecc_mulmod_9_nb(sp_ctx, r, &p256_base, k, map, ct, heap);
}
#endif /* WOLFSSL_SP_NONBLOCK */


#else
/* Striping precomputation table.
 * 8 points combined into a table of 256 points.
 * Distance of 32 between points.
 */
static const sp_table_entry_256 p256_table[256] = {
    /* 0 */
    { { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } },
    /* 1 */
    { { 0x18a9143c,0x0f3986a0,0x1b6d805e,0x152bf8bf,0x0251075b,0x1995bbb1,
        0x1719e7ed,0x0ed4a6ea,0x0018905f },
      { 0x0e95560a,0x0f929abe,0x06791737,0x1571c974,0x1f3258b4,0x03446e90,
        0x16174ba2,0x0304b10b,0x008571ff } },
    /* 2 */
    { { 0x0147519a,0x01443012,0x0cdcbc08,0x103d584d,0x1ebc8d09,0x13e553c2,
        0x03a6a752,0x01bb7beb,0x00d953c5 },
      { 0x1d590f8f,0x0b1b0e67,0x19b245e7,0x12c4d689,0x164cf72e,0x10881175,
        0x03cdff65,0x0fd3d651,0x00863ebb } },
    /* 3 */
    { { 0x1cdb6485,0x02b5b11a,0x028be5de,0x1e1d445e,0x0300b808,0x0caa27bf,
        0x0280f9a3,0x0ab6bff0,0x00000760 },
      { 0x038d2010,0x11a75cdc,0x10dc229d,0x029f7664,0x06606540,0x1e9cc215,
        0x1b838391,0x0c2686e7,0x00830877 } },
    /* 4 */
    { { 0x16a0d2bb,0x1c917e28,0x188d2653,0x1982d834,0x02c8b0d5,0x079d2be3,
        0x19fe4907,0x0c3fa36c,0x002f5e69 },
      { 0x15a01797,0x00ae385f,0x05586497,0x01689ac1,0x1db523d2,0x0d9b838f,
        0x1dec1244,0x02d1ade1,0x00f648f9 } },
    /* 5 */
    { { 0x0137bbbc,0x12b3423f,0x1a82fb27,0x088d3d14,0x13463e43,0x13b0bceb,
        0x0056c710,0x10a267a0,0x005abe02 },
      { 0x004c7dab,0x15541be6,0x098301e4,0x1b3e9886,0x0cc37573,0x0ab13c73,
        0x0e0c324c,0x0b6d6dee,0x0094bb72 } },
    /* 6 */
    { { 0x120f141c,0x1fcda47b,0x1d6f1d2e,0x13679a5b,0x045c4619,0x1094a088,
        0x13bf70fd,0x1965efb8,0x00cdd6bb },
      { 0x0af436fd,0x0533805f,0x04c9afb3,0x08fedb73,0x125226f6,0x13c900a7,
        0x17d8303e,0x17a97b5c,0x00a361be } },
    /* 7 */
    { { 0x197c13c7,0x05512ac2,0x0df0f84a,0x1ac6bea1,0x09d1dc38,0x0d7679e0,
        0x04b01c0e,0x013896a5,0x00ba12ca },
      { 0x19f91dfd,0x12047d22,0x1a81fee7,0x0876cd9d,0x00b293af,0x1844cebc,
        0x1d2c7b3a,0x13ae03fd,0x0053ebb9 } },
    /* 8 */
    { { 0x10e63d34,0x1f3f718d,0x1953ead3,0x000ae553,0x1b5a4f46,0x199a6af3,
        0x00c70124,0x1240daa9,0x008589fb },
      { 0x0583553a,0x1387ae63,0x1592796a,0x121295c4,0x04652087,0x02838802,
        0x113f3241,0x0da04a83,0x00ebb069 } },
    /* 9 */
    { { 0x0c1647c5,0x10b650ad,0x13d5e651,0x04fa8f89,0x1fbacb81,0x1551bb26,
        0x168f7199,0x197a364f,0x00eb2820 },
      { 0x0a87e008,0x0037c6c3,0x08de3ce5,0x1bf53b24,0x0ecb2d87,0x17214066,
        0x08755bb4,0x136ab4fb,0x001f2828 } },
    /* 10 */
    { { 0x1b89da99,0x1dd50601,0x0a1008aa,0x05af3d70,0x005e8a6f,0x1c315c0e,
        0x158c9e11,0x0b20bca9,0x00337a4b },
      { 0x01f7794a,0x033a8069,0x1b5fd84f,0x000b6efa,0x1d6e8207,0x1bc08267,
        0x0f582968,0x1abe985f,0x000d65e0 } },
    /* 11 */
    { { 0x15275d38,0x0e84ddf5,0x1828d636,0x114e8a17,0x0b265426,0x17fa4b9f,
        0x08cbc1d8,0x084a5e94,0x00c23da2 },
      { 0x0b94520c,0x0d0dc278,0x16f5e397,0x0ccec760,0x09ea1096,0x05c34a69,
        0x1fc4e937,0x1198f219,0x0019de3b } },
    /* 12 */
    { { 0x06c5fe04,0x01d38b61,0x0e86f6c6,0x11bc1677,0x1712c3b2,0x02c35265,
        0x0ff5d0cb,0x1a923f99,0x00e34dcb },
      { 0x0aa58403,0x0046a35d,0x1a5e94ed,0x12e90d05,0x0a8af9a6,0x00939b55,
        0x1dfe78e4,0x088f69c1,0x00e7641f } },
    /* 13 */
    { { 0x1f64ba59,0x0ba9ca0e,0x0090bf1f,0x1e21d816,0x01859d33,0x0fe350ac,
        0x1efd3c1b,0x0ae0a54a,0x004a12df },
      { 0x1439dbd0,0x1d319c7c,0x194f87ef,0x0497a97b,0x1b314d3c,0x07fd10f8,
        0x091bf579,0x12776b7d,0x006af5aa } },
    /* 14 */
    { { 0x10c91999,0x1085b4c8,0x16012476,0x09688054,0x020900a2,0x0a5a5c66,
        0x004cf802,0x0b4cd488,0x005fe347 },
      { 0x193e7b4b,0x07c655ef,0x08fe46ac,0x16a034f8,0x06263292,0x04d7668f,
        0x04590ba2,0x011d9fd5,0x00b544e3 } },
    /* 15 */
    { { 0x16ddfdce,0x03c63748,0x045e7999,0x0522cdf1,0x067e12c3,0x173b26a7,
        0x082d3a35,0x17b4d618,0x00e0b6b2 },
      { 0x1b7efb57,0x09896f95,0x031001c3,0x181bbcf2,0x1c9441aa,0x1b56b3cd,
        0x1dd3e40c,0x1bc4b4c6,0x0071c023 } },
    /* 16 */
    { { 0x1fe20925,0x15461225,0x173a19d8,0x0335871f,0x0706391c,0x12eaee9c,
        0x13d96a5a,0x1a843a64,0x0061d587 },
      { 0x037173ea,0x03b39d15,0x1de2d97a,0x090010a6,0x0b43e238,0x020f02dd,
        0x1ef843e1,0x0248c43d,0x00fa11fe } },
    /* 17 */
    { { 0x0cb19ffd,0x0448f959,0x048f08c7,0x151ab763,0x1ca8e01b,0x1eb3c562,
        0x1b72db40,0x0983e277,0x00586eb0 },
      { 0x07e8ed09,0x01ae3729,0x067b7883,0x03467830,0x052fa1e8,0x0b602b63,
        0x1c449e3f,0x010e10c9,0x0019d5ac } },
    /* 18 */
    { { 0x109a4e1f,0x14cfac09,0x09c01d07,0x1bce37d2,0x08d20ab7,0x1785f7e9,
        0x18fc9a97,0x07eff38a,0x00e7c007 },
      { 0x0ef59f76,0x1b6b31d0,0x1f2c1407,0x1676a841,0x002d4669,0x0fbd3d33,
        0x102b0230,0x1fd8cb67,0x00e08504 } },
    /* 19 */
    { { 0x0031b3ca,0x04c7b46d,0x169b59bc,0x19573dcd,0x046e86d1,0x00fd4a79,
        0x1ad16ff6,0x104b6132,0x0078f018 },
      { 0x1a25787f,0x1f77ef21,0x132b26ed,0x0df01a3b,0x1fc36801,0x043bd9ad,
        0x11e833a9,0x170fd28e,0x0043a773 } },
    /* 20 */
    { { 0x12b533d5,0x12bbb9a6,0x0f777018,0x1715ed43,0x0c293673,0x1e4d53cf,
        0x1ac55df9,0x0a38764c,0x00bb6de6 },
      { 0x165259b3,0x1f4981d5,0x0e9d2039,0x015fa7a0,0x0fc27d6a,0x01e8cd9e,
        0x066f16b2,0x134ba317,0x0060b461 } },
    /* 21 */
    { { 0x1ae5aa1c,0x0b51c708,0x19cd962f,0x0eca5693,0x187edb8b,0x000a772f,
        0x1f342c4c,0x1655dd7f,0x009d0f27 },
      { 0x1a730a55,0x1492318b,0x0ef20eb2,0x0ab65fbb,0x19a719c9,0x0ff05600,
        0x12341f07,0x0da6add8,0x00244a56 } },
    /* 22 */
    { { 0x0acf1f96,0x0d81ca57,0x1309c71b,0x02455204,0x1d3b99f2,0x160dc165,
        0x1da4989a,0x10e6b03d,0x0045e58c },
      { 0x038f9dbc,0x1ffa3ced,0x02281034,0x15e28dd1,0x0bed7a8a,0x0fd92370,
        0x1e92516b,0x03983c96,0x00c040e2 } },
    /* 23 */
    { { 0x0f8117b6,0x03d78003,0x08d50ce1,0x12d3fee7,0x075eb651,0x1abb0eca,
        0x1b1d20ac,0x12ed058d,0x001cdf5c },
      { 0x11f04839,0x0dbbada0,0x1785a61f,0x1d59e891,0x132197db,0x0ee8db85,
        0x1cf6ca48,0x1f1525bf,0x00046755 } },
    /* 24 */
    { { 0x1ce8ffcd,0x04562e95,0x1986a0b3,0x0789165f,0x0d6c70d5,0x10b93901,
        0x17cfdbc5,0x02277074,0x00046e5e },
      { 0x18007f01,0x1dc7fb26,0x1d0c60f9,0x03de24b5,0x1a03c7fb,0x0f531af0,
        0x016c1171,0x186607a0,0x006e0106 } },
    /* 25 */
    { { 0x08dd73b1,0x0639ac24,0x17b43652,0x00e11f32,0x02ab7767,0x0f5462b5,
        0x1c7ce0e1,0x1dbd2039,0x00442594 },
      { 0x12d4b65b,0x07d51648,0x12430dfe,0x0468772d,0x18d1f94c,0x1250af4b,
        0x1a3b4c9b,0x0a2985dc,0x00a796fa } },
    /* 26 */
    { { 0x023addd7,0x0cfdb024,0x19a4eccd,0x14c307ca,0x13c809e2,0x1bc71e5f,
        0x1ba7e216,0x1538d2ec,0x00e4ad2d },
      { 0x0e048a61,0x0bfbfa14,0x04b6680d,0x1a331981,0x0d8ef082,0x0d7a601f,
        0x050ff0e8,0x08d86f6a,0x00c5e940 } },
    /* 27 */
    { { 0x0be75f9e,0x1b529c61,0x048e9e11,0x0353d196,0x1c04b6fd,0x06f85884,
        0x1d1f6179,0x15fb68c8,0x0063283d },
      { 0x1af2df15,0x139467bd,0x1669fd33,0x0588aa15,0x0bcc3e59,0x1356f41a,
        0x04e3eac8,0x15633035,0x0068bd19 } },
    /* 28 */
    { { 0x1887d659,0x04756a88,0x164c16b0,0x09abe966,0x14fe3337,0x14c0e7f3,
        0x1f5a5a61,0x1ea78dfb,0x00495292 },
      { 0x1acec896,0x143c64f0,0x16d12112,0x096421d8,0x160a7d96,0x1bf13326,
        0x00dd9a5b,0x01a4c06d,0x000ec753 } },
    /* 29 */
    { { 0x0d2687bb,0x0d09d02d,0x0b887e8b,0x1076d5e6,0x0607ba1f,0x0f7a8eea,
        0x1c2ce43d,0x14cc90c7,0x000f6207 },
      { 0x0f138233,0x0b3f1dd8,0x0aa9c62f,0x0d72d84e,0x088aedd6,0x02039376,
        0x173e3b40,0x0e411dad,0x00ff0db0 } },
    /* 30 */
    { { 0x0c95d553,0x04fd080a,0x1a02a29d,0x00a5faba,0x1566fa44,0x018bff9d,
        0x1a8c60ed,0x07910e81,0x00313b51 },
      { 0x08d11549,0x00171560,0x17b8872d,0x1dc21769,0x0320e071,0x03eea3f9,
        0x1e049ae6,0x1f30de33,0x002d3abc } },
    /* 31 */
    { { 0x015581a2,0x0144280c,0x08846bd3,0x14daacc6,0x12e999a0,0x1d078655,
        0x137c66e9,0x021bdb31,0x00c036fa },
      { 0x01fbd009,0x0d7045d6,0x1456058a,0x1163200d,0x00d8f0b6,0x193bcdcf,
        0x06530bac,0x1896da80,0x00a6b2a2 } },
    /* 32 */
    { { 0x0d3549cf,0x019f287b,0x135997b5,0x06d2dff5,0x1fcb46f3,0x1ed66708,
        0x0181a56f,0x0a55ef93,0x00810ee2 },
      { 0x1159bb2c,0x0a287f0b,0x02cd5ed9,0x1f7d7ceb,0x1ea72f7d,0x1f3a6b4f,
        0x1d14ac15,0x0f524e62,0x00d48571 } },
    /* 33 */
    { { 0x10cb5a98,0x0ba0d457,0x0c442fc4,0x151f263e,0x02adfd3d,0x1165d59c,
        0x01386653,0x14e5f34c,0x006a6045 },
      { 0x02b2411d,0x186069fd,0x03a5b805,0x1d707ca2,0x1b3ccbe0,0x0fb9c432,
        0x1e40ef32,0x1f5f3c2a,0x00d3e45c } },
    /* 34 */
    { { 0x083f7669,0x10fb4ddf,0x01df5af3,0x115d04e5,0x0278d09f,0x172a1922,
        0x06725522,0x1bdc7858,0x00207755 },
      { 0x0fef1945,0x1deb0ecb,0x0b4a30e1,0x0279df62,0x164aa188,0x08eb396f,
        0x00367ef3,0x1cae2a96,0x0048dc5e } },
    /* 35 */
    { { 0x17e5a199,0x11bc85ff,0x0732edc4,0x1f719f31,0x19c79e0e,0x15ff0528,
        0x111709e8,0x1dbbfede,0x00f2fb0a },
      { 0x10b5025f,0x0e04abaf,0x1ea7c890,0x0a87ae81,0x1fbd0550,0x04569c05,
        0x14963e8f,0x02bb651a,0x00a13e90 } },
    /* 36 */
    { { 0x02b65cbc,0x0fbd1a85,0x119089be,0x0972e454,0x107a10b0,0x1120f11f,
        0x09bc9973,0x160292ea,0x002bf0d6 },
      { 0x0b216fb7,0x1ea6e9fa,0x17689ab4,0x0f70cff7,0x0505cf7d,0x1c1fb384,
        0x027ebade,0x0b42c5fd,0x0042a94a } },
    /* 37 */
    { { 0x0aadf191,0x0235685f,0x089a35d6,0x1491204b,0x1c1f60f8,0x182824a6,
        0x18f7a180,0x0d38cbdb,0x002c2dd9 },
      { 0x13849c17,0x0810b8ec,0x0894375b,0x0911743b,0x05485460,0x03831e1d,
        0x16f12043,0x03e858ad,0x00f437fa } },
    /* 38 */
    { { 0x0a0f7dab,0x1506b8a2,0x1dba6b1a,0x092f262e,0x197860f0,0x10287af9,
        0x0aa14b02,0x066a8e0f,0x00aaf45b },
      { 0x018d364a,0x0f1be19e,0x125c5961,0x17360c7c,0x05444d40,0x0b408af6,
        0x0af3d05c,0x01be9e4e,0x00cdf631 } },
    /* 39 */
    { { 0x0ea8b7ef,0x039e311c,0x0f08a1dd,0x126a310b,0x08e3408e,0x13b915ed,
        0x1fc90655,0x175b53c5,0x00f0d008 },
      { 0x0414d3b1,0x089338e9,0x067a9d8a,0x0a930b60,0x1cbdbb37,0x1cb6a29d,
        0x0e2d7186,0x1eb9510f,0x005bd5c2 } },
    /* 40 */
    { { 0x149a3154,0x187a34f7,0x0acba6bb,0x0b4b2adc,0x04a9c3e8,0x160f5549,
        0x1c6516ab,0x191413c8,0x00aa12df },
      { 0x0df69f1d,0x1793913a,0x1fd79cc9,0x09905945,0x1dd44e0e,0x0739dbd4,
        0x0406e763,0x0e7c9195,0x006c036e } },
    /* 41 */
    { { 0x0f6e3138,0x07d70950,0x0b4d1697,0x0dde004b,0x12bc5696,0x0325a2b3,
        0x1892264f,0x0b12d5f7,0x00292ff6 },
      { 0x1e213402,0x09286a22,0x04b27fb5,0x101c4e87,0x072e8f65,0x1cbfed0e,
        0x09d825ec,0x1206236e,0x00644e0c } },
    /* 42 */
    { { 0x047153f0,0x0f210f0d,0x01063278,0x1876f324,0x17672b86,0x0743b82e,
        0x09de4ef7,0x127956f3,0x00f25ae7 },
      { 0x0d869d0c,0x198ca51b,0x01b09907,0x0b910493,0x0945e9d5,0x0f5184b7,
        0x08f927ed,0x0a627b61,0x0039b8e6 } },
    /* 43 */
    { { 0x16fd2e59,0x1baa1005,0x157263cd,0x0580cd24,0x0573935e,0x190d0715,
        0x0c1b676a,0x05e1e33b,0x0039122f },
      { 0x03cad53c,0x1de70f00,0x1705f8f3,0x16581fcc,0x13877225,0x18e94d50,
        0x1e35caeb,0x1f19d01f,0x008de80a } },
    /* 44 */
    { { 0x007bbb76,0x1df546c9,0x1e09d62b,0x18fcf842,0x036b1921,0x1ba58e02,
        0x10137e8a,0x00c5c6d1,0x00871949 },
      { 0x03993df5,0x0fc945dd,0x0cf49aad,0x1aeb6be7,0x15050639,0x13c542da,
        0x1784046a,0x0d4b6e9f,0x00fc315e } },
    /* 45 */
    { { 0x08d6ecfa,0x10fea0d7,0x1b1fe195,0x1889ec35,0x0741d5f8,0x153da492,
        0x02226114,0x15bdc712,0x00e6d4a7 },
      { 0x0593c75d,0x02a9768a,0x09c45898,0x0e1b49ba,0x0c7db70a,0x0f49bdd1,
        0x195f4abb,0x13537c55,0x0035dfaf } },
    /* 46 */
    { { 0x0a736636,0x1cab7e6d,0x0b2adf9a,0x0a3b2f5c,0x0996609f,0x1fa0879a,
        0x14afec42,0x1ae39061,0x001da5c7 },
      { 0x1cce6825,0x020f2419,0x15cf0ed7,0x1a231ff2,0x036b815a,0x0963f918,
        0x075a8a15,0x1fbb7e97,0x007077c0 } },
    /* 47 */
    { { 0x06b9661c,0x1b1ffc6a,0x0b3f5c6f,0x1fa6d61a,0x1f8f7a1d,0x10a05423,
        0x19100dcf,0x05dca1df,0x0053a863 },
      { 0x096d8051,0x0bb7fb43,0x13d1a282,0x18192b8e,0x026bddae,0x06e1af27,
        0x13058a65,0x0da69c3f,0x00028ca7 } },
    /* 48 */
    { { 0x1c9877ee,0x08ea3ee7,0x074000b4,0x06c42100,0x060b6c8b,0x008baa61,
        0x011b400b,0x1b0d2c5e,0x0004c17c },
      { 0x10daddf5,0x0cde84a5,0x1395701b,0x046aea49,0x003b5bea,0x0b73396d,
        0x11d198cd,0x1d3fdb2e,0x00f7ba4d } },
    /* 49 */
    { { 0x0be1263f,0x06dfd1a7,0x0b9f39b4,0x0c6e6ae3,0x0f523557,0x02a9c153,
        0x11074910,0x000a4263,0x00e31f96 },
      { 0x0a6b6ec6,0x0ddc90b7,0x10bf1134,0x03a25ce7,0x0a29437a,0x1f5644e8,
        0x11ef0439,0x0b39c69a,0x00aa3a62 } },
    /* 50 */
    { { 0x16f3dcd3,0x1e7cefa9,0x0fdcd83e,0x1bdaa1a5,0x04f5b6ce,0x087d6fa8,
        0x0bb9245c,0x0c4fcf3b,0x002398dd },
      { 0x0d09569e,0x1a382d1b,0x127dda73,0x0c3376a2,0x0034cea0,0x01bb9afb,
        0x0843fe70,0x1643808c,0x005717f5 } },
    /* 51 */
    { { 0x01dd895e,0x1f114e49,0x10a11467,0x030a0081,0x17ecd8e5,0x091c8eb1,
        0x037be84f,0x0ac1c785,0x00660a2c },
      { 0x167fcbd0,0x06544576,0x0a7c25a7,0x0e48f01d,0x12b4dc84,0x1a40b974,
        0x114ccacb,0x0989ea44,0x00624ee5 } },
    /* 52 */
    { { 0x1897eccc,0x0aa4e726,0x06202a82,0x13a3b27f,0x07c204d4,0x1211821d,
        0x0f01c8f0,0x1f7257bf,0x004f392a },
      { 0x1de44fd9,0x0b4fc7d3,0x0cc8559a,0x19f7c8af,0x0bc3cb66,0x14019b47,
        0x06736cbe,0x0ef99b67,0x008a3e79 } },
    /* 53 */
    { { 0x06c4b125,0x0f0c40f8,0x18f2a337,0x09c601ed,0x013e9ae3,0x0cef2e3d,
        0x1013bda6,0x046e1848,0x003888d0 },
      { 0x04f91081,0x11401ab2,0x0055411d,0x1f9ec2be,0x0d36e3d9,0x16e43196,
        0x0cd8609f,0x08e30204,0x00a5e62e } },
    /* 54 */
    { { 0x0facd6c8,0x1412f719,0x0f2f1986,0x18c6a8a9,0x19931699,0x16fbcc6f,
        0x0b70338f,0x1cc8cd4b,0x002c4768 },
      { 0x10a64bc9,0x1a37fc64,0x1de7d72c,0x14c041c8,0x1e884630,0x08325e02,
        0x0a836527,0x083f3cca,0x007b5e64 } },
    /* 55 */
    { { 0x1d28444a,0x0b4a1160,0x04da8e48,0x0d8bb17c,0x07fcee99,0x17f2fd86,
        0x11288e1e,0x196191ae,0x00b8af73 },
      { 0x138b86fd,0x1ef41d51,0x02973fd7,0x07e2b14b,0x09433fee,0x07b79056,
        0x025727ba,0x0befe7e1,0x00a03639 } },
    /* 56 */
    { { 0x010f7770,0x039e35dd,0x0a838923,0x02db0342,0x02b9fa6f,0x1b4128de,
        0x14cc4037,0x0030ebf6,0x004be36b },
      { 0x1fb56dbb,0x11304374,0x19e93e24,0x1fdf160f,0x12f20306,0x0602b36a,
        0x0303bab3,0x10e37b80,0x008cbc9a } },
    /* 57 */
    { { 0x00dac4ab,0x098c4ae6,0x0bfc44b8,0x094880e2,0x0ee57a87,0x173e350e,
        0x17e18cca,0x07c18106,0x0044e755 },
      { 0x1734002d,0x0a81fffb,0x0d10971b,0x0b971616,0x138b59d3,0x013b0743,
        0x106257dc,0x074bd71f,0x00470a68 } },
    /* 58 */
    { { 0x10513482,0x0dbb0ee4,0x1a49daa0,0x0e405403,0x13083028,0x00f70673,
        0x1bbf3691,0x1218c7b8,0x00164106 },
      { 0x0d06a2ed,0x081a5033,0x06c402fd,0x1aee8a31,0x018c9dd4,0x173955c1,
        0x0d3f6452,0x1faf5797,0x00d73479 } },
    /* 59 */
    { { 0x1ad4c6e5,0x16f7d8b2,0x01b4135f,0x19e11eb6,0x1cb14262,0x0dd8c2ba,
        0x19ac4bb5,0x1c60ee2c,0x00816469 },
      { 0x161e291e,0x1d5cebca,0x17859875,0x1b5e4583,0x00513eb9,0x13f589af,
        0x1e73d260,0x047e1ba7,0x000a36dd } },
    /* 60 */
    { { 0x01d5533c,0x0c69963a,0x0118a3c2,0x1eb53d0d,0x1bd117c5,0x1456f1a4,
        0x0460e688,0x1adfb756,0x00e331df },
      { 0x0bcc6ed8,0x08055b43,0x1e898394,0x01877bde,0x050d7716,0x0cd3de74,
        0x0e26418f,0x054925c6,0x00d3b478 } },
    /* 61 */
    { { 0x13821f90,0x0a4db747,0x1adeab68,0x1bb3dacd,0x1311692e,0x14a98d00,
        0x16f42ed9,0x0b4990d4,0x00728127 },
      { 0x13ff47e5,0x01c2c7be,0x00591054,0x0c2d78c2,0x19bb15e1,0x188d3efe,
        0x01658ac3,0x0fd9c28a,0x002c062e } },
    /* 62 */
    { { 0x0159ac2e,0x1b7ccb78,0x16c9c4e9,0x1cee6d97,0x06047281,0x09440472,
        0x1bc4ab5b,0x1f2589cf,0x00282a35 },
      { 0x00ce5cd2,0x01aa58f6,0x1e708a67,0x13df9226,0x0c11ecf9,0x179c1f41,
        0x0af664b2,0x026aa9a5,0x00c71cd5 } },
    /* 63 */
    { { 0x09b578f4,0x042ef4e0,0x0bfe9e92,0x09c4b1c7,0x02f1f188,0x18dbac8c,
        0x0e8e3dda,0x0819e8fe,0x00c50f67 },
      { 0x174b68ea,0x0e256f99,0x0597f8aa,0x0de646d3,0x13050a40,0x111142d2,
        0x0370be1a,0x14e4252b,0x00b9ecb3 } },
    /* 64 */
    { { 0x14f8b16a,0x17c20877,0x1ec99a95,0x0835fd88,0x087c1972,0x15c736ce,
        0x0c6c2901,0x0059a855,0x00803f3e },
      { 0x04dbec69,0x18184d40,0x0eb417df,0x170bee77,0x0197fa83,0x1939d6c7,
        0x17071825,0x01ca0cf5,0x00c09744 } },
    /* 65 */
    { { 0x0379ab34,0x0352b796,0x077e3461,0x1c0d1708,0x068efa8e,0x022c8bb6,
        0x1cc080c5,0x1ab22be3,0x00f1af32 },
      { 0x1d75bd50,0x0e1ba98a,0x0bd9ef26,0x19ff75ee,0x1723f837,0x120c246b,
        0x122c184e,0x061c5a83,0x0023d0f1 } },
    /* 66 */
    { { 0x141500d9,0x0bd5b76f,0x0fab6a21,0x1215cbf9,0x059510d8,0x032444b9,
        0x0b754bfa,0x1ad8147f,0x00b0288d },
      { 0x050bcb08,0x09907983,0x175b85a1,0x1ec626d2,0x1aa7671a,0x1053dcc4,
        0x0348c7d4,0x09fe8119,0x00ffd372 } },
    /* 67 */
    { { 0x1458e6cb,0x1cb47325,0x1e974a14,0x1b5a4062,0x15f56992,0x1705bd53,
        0x1b7ce052,0x095af184,0x00f5590f },
      { 0x0f0ba55a,0x1e125e9e,0x1de2eb83,0x08e49418,0x1674a0fc,0x0327b41d,
        0x088073a6,0x0a9edee9,0x0018d6da } },
    /* 68 */
    { { 0x15be5a2b,0x0c9f112e,0x0d3cf1bb,0x0f3306b2,0x06ffc6fe,0x04931131,
        0x05a90c50,0x1b2f3204,0x0050bbb4 },
      { 0x057ec63e,0x1c0c8e37,0x07736c8d,0x04588030,0x0e0f6654,0x04cd811b,
        0x070d06a0,0x03003fc9,0x002b1001 } },
    /* 69 */
    { { 0x1b391593,0x0345ae2c,0x009c3f3f,0x0beb44b3,0x0dcbbc38,0x19d568cd,
        0x1831c513,0x13307f75,0x00dd5589 },
      { 0x14b82ff4,0x1dc45c73,0x19cd3264,0x007880e3,0x0322ad2e,0x0f57a1e0,
        0x010669ea,0x0a2293ac,0x00e6e4c5 } },
    /* 70 */
    { { 0x1e9af288,0x0fb2add8,0x0b6a4c55,0x1c34c9ef,0x020e5647,0x1f25e594,
        0x1bfd0da5,0x1620fdaa,0x0051e00d },
      { 0x171c327e,0x1e8b4dc3,0x05b0ab50,0x1b641695,0x1477929c,0x08fa9ef5,
        0x05df01f5,0x08293052,0x00e22f42 } },
    /* 71 */
    { { 0x035f1abb,0x0a2f47a3,0x14e21d33,0x18196ad0,0x0034d7ed,0x160fdad4,
        0x0327251c,0x07aa5b89,0x00f70937 },
      { 0x08af30d6,0x00cb35dd,0x0deda710,0x1ebe95e2,0x1c47e95b,0x0b1549b0,
        0x0c44e598,0x111ce4eb,0x00bd52d2 } },
    /* 72 */
    { { 0x1c5fa877,0x18aae3d4,0x0e8f522a,0x15ace4fa,0x189d817d,0x1fcf39e8,
        0x1e990fd0,0x1c99154e,0x00a0d0f8 },
      { 0x0c94f92d,0x1df57ec6,0x1376ce82,0x11917c18,0x0ba14d81,0x12fc5c17,
        0x08008b31,0x18f28dad,0x00a56c78 } },
    /* 73 */
    { { 0x0dd09529,0x0b11c8d8,0x0b77f3ca,0x1c1d4c7b,0x1f481803,0x1a8fadad,
        0x19e8b1dc,0x1f0e6346,0x00d8befd },
      { 0x1c0157f4,0x1c8cea17,0x1239942a,0x195daffd,0x08b0af51,0x05a0016a,
        0x11e337e7,0x14b9d3ec,0x00854a68 } },
    /* 74 */
    { { 0x03506ea5,0x01afb3db,0x1f8359b7,0x0d891349,0x1cd4d928,0x0e9dff4a,
        0x0a54fc40,0x0173108d,0x005cacea },
      { 0x1ceac44d,0x086fb064,0x13470eaa,0x0535e86a,0x1babe3db,0x1ef456ae,
        0x1ea42374,0x0246bc9d,0x00e4982d } },
    /* 75 */
    { { 0x034cd55e,0x18825116,0x00344c88,0x12b7664d,0x1d943586,0x0d7d0fd0,
        0x1267ecd1,0x1ec2d640,0x008046b7 },
      { 0x18e7d098,0x099ac0f1,0x1bc2dc2d,0x0c3d1be8,0x178c4d7f,0x14f52265,
        0x1d54c37a,0x0f721055,0x00eb17ca } },
    /* 76 */
    { { 0x16a145b9,0x1a8dacc3,0x0f1c7b05,0x1ed61f83,0x115bba5c,0x1ab29c93,
        0x04c74f80,0x175f56bc,0x00097b00 },
      { 0x165f69e1,0x1336474a,0x0f94666a,0x11eeb56b,0x1d98477e,0x1d08ed27,
        0x127980ce,0x0f75fb79,0x00f95c74 } },
    /* 77 */
    { { 0x1ebae45e,0x0c780e9d,0x0f1a5555,0x17d3e189,0x04fc6a8e,0x02d8ede3,
        0x00debadc,0x03cacddb,0x00351260 },
      { 0x1a1161cd,0x19b78f0f,0x197be1e4,0x1571aa98,0x121e5328,0x17713927,
        0x0dad1d5f,0x046c0d15,0x000ef971 } },
    /* 78 */
    { { 0x14ca4226,0x12cc67ba,0x190b2380,0x1bc271f0,0x017905ee,0x1fba2347,
        0x12552258,0x066769f7,0x00fc16d9 },
      { 0x07c800ca,0x14b7d98f,0x1e2b6aaf,0x00c6624c,0x1e8b5138,0x024bb7f9,
        0x085cf589,0x1e372baf,0x0014ca4a } },
    /* 79 */
    { { 0x1d2f81d5,0x123b8dd5,0x1df4659e,0x1f3ad203,0x1c9071a5,0x1f7be56c,
        0x0c776262,0x0c7eb384,0x004057b0 },
      { 0x09c05c0a,0x1fec17f4,0x1037e16f,0x0238de3b,0x016dbe49,0x065751ad,
        0x0c4cefbf,0x0c9e2661,0x001c3b5d } },
    /* 80 */
    { { 0x00ec21fe,0x1f0a5ff4,0x156fa097,0x1c22d584,0x05d67f6c,0x0d0397a5,
        0x0ebe62f1,0x091b6fcc,0x00fad271 },
      { 0x09ab05b3,0x0605b561,0x0946b9a4,0x1350789c,0x0de7d37a,0x043ae155,
        0x0a1029f7,0x1c73e1c3,0x0077387d } },
    /* 81 */
    { { 0x056c0dd7,0x14f6624d,0x021b1d07,0x1ff9b08c,0x1aecea5c,0x0a047a82,
        0x11fa3de8,0x1817de18,0x00b37b85 },
      { 0x0c0e6a8f,0x0cb5b726,0x0e23c8cd,0x1a977ed6,0x0ef4efd6,0x09fd61ce,
        0x0356ae91,0x191f3ec5,0x009c135a } },
    /* 82 */
    { { 0x04e35743,0x15519014,0x08f37bcc,0x1ad5630b,0x19819320,0x18bb0ef8,
        0x147ee086,0x03f88670,0x00572136 },
      { 0x11fc9168,0x186d9b53,0x17100f07,0x1174e6bc,0x0d8f55f9,0x143f1bde,
        0x06f7d932,0x193cd762,0x00dcbac3 } },
    /* 83 */
    { { 0x0518cbe2,0x00eccb42,0x07ac13bc,0x05f83139,0x1eebfd24,0x11e3f23f,
        0x0189c9d9,0x13c5ac4d,0x00b8c1c8 },
      { 0x08e1d569,0x0d2c5eee,0x16233414,0x1013916f,0x131eb563,0x1fecf88f,
        0x0b509b09,0x1b45f284,0x005d23bb } },
    /* 84 */
    { { 0x15c8f8be,0x10e394a4,0x1cd8afc2,0x03890077,0x1d4ac296,0x0201efb1,
        0x04027906,0x19723d9d,0x00c109f9 },
      { 0x18945705,0x1684ae82,0x1ae17030,0x107b2dbb,0x0449bb90,0x15c6bd20,
        0x1b8611a4,0x09e5ddc3,0x009bc334 } },
    /* 85 */
    { { 0x02913074,0x0ad71ab2,0x0950ac43,0x12364e91,0x0732a554,0x1332d988,
        0x13051a72,0x0a4be349,0x0029591d },
      { 0x184f983f,0x1b7adb5d,0x17e13879,0x1dde833e,0x0a189be7,0x0a4b405d,
        0x0cb04803,0x03e31de6,0x00637655 } },
    /* 86 */
    { { 0x162976cc,0x0d2f8a72,0x1c4b0e2f,0x1947cc1d,0x0985222b,0x18323665,
        0x01eaefe8,0x19011c53,0x00bdb79d },
      { 0x0b06a772,0x0965ae4e,0x14db73bf,0x08eb55fc,0x15db838f,0x10113e15,
        0x052b0a8f,0x0035ba78,0x008ee860 } },
    /* 87 */
    { { 0x04ade873,0x1f4b4c0d,0x1ee92332,0x13549b89,0x14ba57ee,0x144cad02,
        0x092cb3b8,0x0f4deef5,0x0092e51d },
      { 0x1190a34d,0x045d7d43,0x0f47b465,0x11eeb7ed,0x11144d69,0x13718657,
        0x0aab403b,0x0de14ad5,0x005182f8 } },
    /* 88 */
    { { 0x1a4cc99c,0x1d310963,0x1b67287e,0x0136d07c,0x18c5aff6,0x13e5ad64,
        0x1bc976ec,0x0ba80e74,0x0091dcab },
      { 0x1f575a70,0x0db661ea,0x0361fe80,0x06c272df,0x017360cb,0x074644cc,
        0x1cac5975,0x1b72f2e9,0x0017a0ce } },
    /* 89 */
    { { 0x076c8d3a,0x0430f150,0x03e492ce,0x155a7242,0x035d9701,0x157209d4,
        0x1d065343,0x0d8fe99b,0x002e8ce3 },
      { 0x037a862b,0x0939ed58,0x19323ea4,0x15376ec1,0x0f2dd01b,0x09c419dd,
        0x03cfe591,0x19669ecd,0x00f4ccc6 } },
    /* 90 */
    { { 0x11f79687,0x077a92e7,0x1bea0551,0x12a92b25,0x18d297c5,0x0ba0d2e3,
        0x0f27848c,0x111341be,0x00ac0db4 },
      { 0x1f01747f,0x15fe388e,0x05f7c4e1,0x1726b1de,0x16bb5592,0x0727ae65,
        0x128b9620,0x0c32992e,0x0095a64a } },
    /* 91 */
    { { 0x015a4c93,0x160f7ed6,0x1614505c,0x0d36e704,0x10bad402,0x1d8e0b65,
        0x19ddaa37,0x17452420,0x00231e54 },
      { 0x0ae6d2dc,0x186fc8bc,0x044a4629,0x154c7e72,0x172234d6,0x1935af2d,
        0x0787d89d,0x065b14e6,0x00ab0be0 } },
    /* 92 */
    { { 0x0d131f2d,0x0bd6874c,0x013c4042,0x1e13c676,0x1a748637,0x10cb6af4,
        0x19e46b21,0x10059ed4,0x00f1bcc8 },
      { 0x08daacb4,0x0e348a07,0x1d940249,0x1c80aac1,0x137a63c4,0x047e23bc,
        0x09c56473,0x0d2b5d76,0x00851694 } },
    /* 93 */
    { { 0x11dcf593,0x11ae0a1f,0x062f8ef7,0x00565360,0x19d3d782,0x16e14dee,
        0x1763a736,0x1a5b55aa,0x008f67d9 },
      { 0x1481ea5f,0x0088b2b3,0x13164321,0x05bbd3c6,0x13fa8e7d,0x01fa0282,
        0x0d77ff75,0x17380e51,0x00f84572 } },
    /* 94 */
    { { 0x17af71c9,0x10d3d38c,0x1cd95957,0x092888f4,0x15063a14,0x1703870e,
        0x106686d2,0x020c2d65,0x00edee27 },
      { 0x11734121,0x1781a7a8,0x097a7c2c,0x18dcaa94,0x02ecf1ca,0x0479d206,
        0x1fd23705,0x13689d7a,0x009fd27e } },
    /* 95 */
    { { 0x16e2cb16,0x063b2c57,0x16466d8f,0x16fa59fc,0x15583e3e,0x0c0b0b46,
        0x0e1d6a31,0x16d2b1fe,0x00a40c2f },
      { 0x1edcc158,0x04f62b07,0x1c8c15a3,0x10098cab,0x07e127ad,0x13824d18,
        0x1b3f64e5,0x170fb8db,0x0099bc9b } },
    /* 96 */
    { { 0x127dafc6,0x054a90ec,0x02734661,0x03f6d2b8,0x06dde52c,0x00d07c9b,
        0x19927656,0x01742daf,0x009abe21 },
      { 0x08915220,0x0057c252,0x1605b192,0x062ed49b,0x1ca5afa7,0x1cc38b40,
        0x12c31f54,0x0af0fe68,0x007881c2 } },
    /* 97 */
    { { 0x00bcf3ff,0x19ccda8f,0x1fdd3da4,0x05978a24,0x1d9680d0,0x12d16e80,
        0x05023ed1,0x033461d1,0x0015e6e3 },
      { 0x1e0e05f4,0x036b7069,0x16210119,0x0f7bb886,0x050d3fad,0x03e8e27c,
        0x0b3af987,0x19e3222e,0x000e55fa } },
    /* 98 */
    { { 0x18787564,0x14ecc037,0x1a17399f,0x062e4263,0x1e8d61a3,0x0c655c0c,
        0x15ddac05,0x0ecdfd2c,0x00d73d09 },
      { 0x1eb7206e,0x1241a128,0x062ed090,0x12521f8c,0x0a520a51,0x1c2caf18,
        0x142d772e,0x0e91e2b4,0x009250a3 } },
    /* 99 */
    { { 0x1e577410,0x17f847c5,0x1dea31b2,0x011406a0,0x063a4fd4,0x1944f605,
        0x102fc7d8,0x10583991,0x00774140 },
      { 0x0b0991cd,0x0d207d37,0x1f70a581,0x1410cc93,0x0fd40c1c,0x11e3d992,
        0x02e4e9a2,0x09a25d64,0x008cb04f } },
    /* 100 */
    { { 0x0906171c,0x0e1682ab,0x09030fec,0x07d39b60,0x06841907,0x15a7ec48,
        0x0d476e39,0x1de8e247,0x00e4e429 },
      { 0x18ec36f4,0x1c6ea9e1,0x12da89c2,0x05b803fe,0x09a48f9d,0x1703c3cd,
        0x15497419,0x1fe78dcc,0x0037bca2 } },
    /* 101 */
    { { 0x1f562470,0x06971e3e,0x0592b253,0x04e54581,0x193be44f,0x0efcc063,
        0x08a9f1b5,0x1b860056,0x0059913e },
      { 0x1750592a,0x109cd41a,0x00f7809e,0x003b01cf,0x1d64f99e,0x01baf502,
        0x089b3e30,0x0956027c,0x0043786e } },
    /* 102 */
    { { 0x1e56b5a6,0x1995876c,0x1f1a3e7f,0x01b34db3,0x046a7075,0x1422acbc,
        0x19ebb057,0x1316fcf3,0x008638ca },
      { 0x0afc24b2,0x1ad704b0,0x0b3a3c8b,0x131d5e9b,0x1a78f053,0x0ee85765,
        0x1bc0edd9,0x0d4f6754,0x001ecdd3 } },
    /* 103 */
    { { 0x0c5ff2f3,0x09d66b13,0x1cea5e17,0x0a2d8050,0x10d54a2d,0x04fd6908,
        0x0cb6b653,0x10ba8b3e,0x00d85d0f },
      { 0x10b11da3,0x1b805c68,0x00c63127,0x0458614f,0x0decdd2c,0x047a4904,
        0x118955a6,0x18769da7,0x00a04f19 } },
    /* 104 */
    { { 0x0d7f93bd,0x03c92647,0x0bd47d82,0x0958ba72,0x171afcb6,0x1985410d,
        0x02c1f2b8,0x1d4b812a,0x0092b2ee },
      { 0x05b6e235,0x0d6264a4,0x0db03c21,0x19495252,0x08891ab2,0x1359f028,
        0x1db203ea,0x042b0684,0x001ee782 } },
    /* 105 */
    { { 0x063e79f7,0x10517007,0x067641a9,0x01cf65e7,0x1c09df59,0x02a53303,
        0x05424084,0x1b0af4dc,0x00f3f2ce },
      { 0x110d9b55,0x0028879f,0x19099208,0x1f9f59b0,0x10e7c9d2,0x0d53f45e,
        0x0843958c,0x0a87b47c,0x000f56a4 } },
    /* 106 */
    { { 0x1043e0df,0x190dffd0,0x001f9b56,0x096d9938,0x0517a6c7,0x17606a54,
        0x098c6995,0x08232d3c,0x00bd8f17 },
      { 0x1eb7494a,0x14dddc35,0x1cee0e22,0x0fa8de8b,0x1a79a156,0x0953d272,
        0x08277de8,0x06a6199f,0x002d1a1c } },
    /* 107 */
    { { 0x106508da,0x0971c09a,0x15e569c6,0x03018943,0x144b3336,0x0ca4bd4c,
        0x091b376d,0x0bd723f7,0x00a107a6 },
      { 0x0f94d639,0x168e8e28,0x162df5f9,0x15e6eb14,0x1ca1c8b4,0x0ac25e9b,
        0x0bc869f1,0x015f0f53,0x00183d76 } },
    /* 108 */
    { { 0x0dde59a4,0x0eb4b888,0x02fbe1ca,0x1b1a0e1d,0x0be78f1a,0x04b1a797,
        0x1d508a6d,0x13b84d3a,0x001d4417 },
      { 0x0390d30e,0x196e067c,0x1a04432c,0x164ea61b,0x0339a0a3,0x0ee295e0,
        0x0988c6bc,0x1852c0da,0x00771f9c } },
    /* 109 */
    { { 0x05040739,0x0cc9f3bc,0x09aa4e66,0x073b7300,0x0fc26445,0x1b797afc,
        0x063b3d03,0x06206c4e,0x0064427a },
      { 0x05428aa8,0x1a796c3c,0x1ed26a13,0x15b87fd7,0x101ac7b7,0x1636f91e,
        0x15b4806c,0x092d5d21,0x0049d9b7 } },
    /* 110 */
    { { 0x035d1099,0x03c6c5e2,0x03468233,0x179a9d1d,0x08a412ad,0x1150165b,
        0x11140b0b,0x0367ec0a,0x009037d8 },
      { 0x074c7b61,0x06dd6138,0x0ff5cb9f,0x006356af,0x15352fe2,0x164b2cb6,
        0x0e718733,0x0d4f980c,0x0008c3de } },
    /* 111 */
    { { 0x16d552ab,0x07ee8107,0x13607c48,0x15ff300b,0x1129156b,0x1e1f489a,
        0x0cbc1bed,0x0848af2d,0x00c69094 },
      { 0x01231bd1,0x1d9d74e2,0x11608145,0x18dd0eb9,0x0a1221ea,0x1bd5fceb,
        0x0b008220,0x00595fc7,0x003fa3db } },
    /* 112 */
    { { 0x05058880,0x1ad1f328,0x0e50fcb5,0x06cbdec8,0x049257da,0x030e7d59,
        0x03fd051e,0x161fb701,0x00c5c4bd },
      { 0x1272b56b,0x1a89f1a5,0x0e410e9c,0x04fd2a23,0x04969c83,0x11befc42,
        0x1ad7f633,0x1288d856,0x002d56db } },
    /* 113 */
    { { 0x1f46ac6b,0x030bc17f,0x08b90949,0x1ef24c0f,0x08de1d19,0x11e204d2,
        0x090bebfa,0x13bca077,0x000f56bd },
      { 0x145cda49,0x1bea7689,0x1bca6744,0x02b1f902,0x03402821,0x12a5575a,
        0x17c79f1a,0x13a22e76,0x004003bb } },
    /* 114 */
    { { 0x00803387,0x1c740c4d,0x12f5010e,0x022bea73,0x17f21ece,0x1046e943,
        0x1e790a5c,0x04540fe5,0x00537655 },
      { 0x08a4182d,0x04c0510d,0x0677de69,0x17a0f464,0x1a2d4a2b,0x05170d0c,
        0x15259d34,0x0b0d8ba8,0x007a056f } },
    /* 115 */
    { { 0x1d8a2a47,0x03592ac4,0x17c9dcd9,0x10529187,0x0d5395b5,0x000755f8,
        0x19d547b0,0x1e2f4344,0x0077d482 },
      { 0x07853948,0x050decac,0x1efffbae,0x102f7ad9,0x01e47a6f,0x002bc034,
        0x0392adbb,0x05656716,0x00411501 } },
    /* 116 */
    { { 0x0de28ced,0x039f87a3,0x04fb11cf,0x1b4ec136,0x063921d5,0x074f372e,
        0x051986e3,0x0e5f7d41,0x00cdf045 },
      { 0x0c53c3b0,0x059e2c5b,0x1ee10f07,0x1c782088,0x1780e97f,0x0570965c,
        0x0427ecae,0x1b52e706,0x00ee703d } },
    /* 117 */
    { { 0x1f57e43a,0x028a8a07,0x0e046e0d,0x0cc1a763,0x0b986d44,0x0effc7a1,
        0x1884aced,0x13b42c59,0x002a0ad8 },
      { 0x0bc277ba,0x072534a3,0x10709d99,0x1192a982,0x16274c78,0x1326655f,
        0x1964506a,0x0cf58568,0x00d62d0b } },
    /* 118 */
    { { 0x0c054ac4,0x0e2ec3d9,0x1f7de20e,0x00b0b3e4,0x128d6570,0x05f9d8c0,
        0x109bb7df,0x1e532384,0x00b39a23 },
      { 0x10b16ae5,0x094250af,0x0dbd46e5,0x140b6342,0x007830c6,0x009bf938,
        0x1314758f,0x12580ce9,0x0004ed00 } },
    /* 119 */
    { { 0x1ae90393,0x1a0c2e8c,0x0f593987,0x0f685294,0x0fc14304,0x00d34c2a,
        0x0e1eb800,0x18202ef8,0x00a0a91f },
      { 0x0e2c831e,0x1851f80d,0x1c9f85bf,0x0d5d0456,0x075b4bb7,0x0450ad18,
        0x11063c4b,0x1113da41,0x00084cf9 } },
    /* 120 */
    { { 0x1ca6becf,0x0c284ef7,0x1fecca36,0x1d5d00fb,0x0e8b92fc,0x0ae223bc,
        0x1df97628,0x164e757e,0x00d57955 },
      { 0x11b5d4f1,0x086d3cf1,0x1e9e8708,0x05e09679,0x1c20baa5,0x1044ee13,
        0x07c75344,0x08405a28,0x008e14ea } },
    /* 121 */
    { { 0x12897042,0x16a81a2f,0x100b12bb,0x0a663e86,0x1fb218d0,0x00ca645e,
        0x05632367,0x06e5549a,0x00597e1a },
      { 0x0f0bd68c,0x193f60d6,0x00925140,0x17c1b956,0x03e846d4,0x06bd64ff,
        0x17a96e72,0x06c33369,0x00ca3f02 } },
    /* 122 */
    { { 0x0170bd20,0x095085ab,0x0fd779d6,0x112fe2da,0x0ade20ea,0x1ff8a259,
        0x1f928cd8,0x0fc61380,0x00bde7fd },
      { 0x18f5432c,0x0b5db695,0x10d112d4,0x1b8397c0,0x15b5a210,0x0f37fc7c,
        0x0660f6c0,0x01c14fba,0x00b623ad } },
    /* 123 */
    { { 0x00c7b65b,0x1adeb3ab,0x0928a269,0x18ab2047,0x06795ab8,0x07e86bd9,
        0x0defe088,0x08cb1d82,0x00d6aa2e },
      { 0x1138bb85,0x055e005a,0x0cea5704,0x03a243b0,0x0a32e8c3,0x18058b81,
        0x04eac93f,0x1c05b98a,0x00111662 } },
    /* 124 */
    { { 0x0fb42b87,0x008a00af,0x1b137fde,0x1ebae036,0x1c129bd9,0x066bd3eb,
        0x03e19bb3,0x197296ea,0x00db3ee1 },
      { 0x134837cf,0x1379ed87,0x15e353ec,0x1da31772,0x0657de7e,0x0fc9be2b,
        0x096574b3,0x084a440d,0x00886a64 } },
    /* 125 */
    { { 0x05b569ea,0x011a67db,0x0846704f,0x022283ee,0x0619e200,0x042ed0ad,
        0x1ef22eb7,0x1d603142,0x00a70cf4 },
      { 0x0c4a6a65,0x127cbd74,0x0d0de3c8,0x0b9e4e02,0x0096036e,0x104f27bf,
        0x0ddef8e9,0x157a2e8f,0x00aa4772 } },
    /* 126 */
    { { 0x1aa60cc0,0x1b3b098b,0x1a0457d9,0x02c6c206,0x1bb5ac79,0x05da5de0,
        0x05d37b66,0x1b861f5f,0x00611a6d },
      { 0x015ee47a,0x073c65e6,0x0365a94c,0x12c5049c,0x1ed882e8,0x0d6f9eec,
        0x1220dbcd,0x1f02c853,0x005cfffa } },
    /* 127 */
    { { 0x1b7a99cd,0x06aa67fc,0x0f116870,0x07733b08,0x139e17bf,0x0847b163,
        0x05300e2a,0x046fb833,0x006e5a6b },
      { 0x0ba5db77,0x1c5a2a70,0x1d8358fb,0x1100ff59,0x08378b7b,0x00633b30,
        0x0f339647,0x11a485b5,0x00481a23 } },
    /* 128 */
    { { 0x15d0b34a,0x1a0bde01,0x09f029f8,0x1670d706,0x162d1440,0x1316d601,
        0x050e3edc,0x099c19bf,0x002c4111 },
      { 0x0d95a0b1,0x1d2e778d,0x1550d88a,0x166f50cf,0x086c9c09,0x06e900f2,
        0x0a5c9b5b,0x17e85ff2,0x0020477a } },
    /* 129 */
    { { 0x18d65dbf,0x1ba8b9e0,0x07b6b60b,0x1f281c67,0x1001c77b,0x0935ee78,
        0x1ad9c08b,0x1358ee72,0x00ac6640 },
      { 0x06261cc3,0x185d9b7e,0x039fa422,0x1ef79232,0x06c10213,0x075d522f,
        0x1e159507,0x0eb98245,0x00ce8e69 } },
    /* 130 */
    { { 0x1c0a67d2,0x1890da0d,0x13492283,0x08ec1488,0x1473762d,0x078eb2cd,
        0x12a03811,0x0ca4a176,0x0008fde3 },
      { 0x048bf287,0x07761ed4,0x0da75bab,0x0c4305a6,0x09482c2a,0x0fee4922,
        0x135cd60b,0x1a4acbad,0x002f7e2f } },
    /* 131 */
    { { 0x03770fa7,0x125c96de,0x0410fe6b,0x1d1ab86f,0x01171095,0x074e8bbb,
        0x0ab953cd,0x05d20ee0,0x00c65be9 },
      { 0x16fd0a40,0x1ac5181f,0x139e12c9,0x1045c779,0x167bfe7d,0x1ac2a7cb,
        0x0ce9eb93,0x08fa2327,0x004bff8e } },
    /* 132 */
    { { 0x00ff1480,0x0a0e90f8,0x1536c5b3,0x11f6fa0e,0x0f3ea2ab,0x0977ddf0,
        0x19f6b207,0x1ccaee52,0x003e4e4a },
      { 0x1c5303e6,0x10c79b69,0x0988e5df,0x13329724,0x0c3c03bd,0x07130992,
        0x00a27b5c,0x1fab1d8c,0x005388ae } },
    /* 133 */
    { { 0x1e5d7713,0x0898bf5a,0x179276ab,0x130bdceb,0x1b26109b,0x1e27e3a7,
        0x1838cbd6,0x1a29eeb7,0x005cf908 },
      { 0x0e657b12,0x1021a884,0x1bb6799d,0x08434b72,0x0ccc2bfd,0x1a8fc4b8,
        0x138838a7,0x080c1e01,0x00a698ba } },
    /* 134 */
    { { 0x0f748fec,0x1ed8b437,0x074b3e5c,0x0eab44fd,0x05effe6e,0x12a26713,
        0x16358c2d,0x114f5d75,0x00b142ef },
      { 0x17d5770a,0x098d7cf8,0x0cd04beb,0x1e76ce59,0x159de66a,0x068def99,
        0x01d5af58,0x12cb0a2a,0x00d1896a } },
    /* 135 */
    { { 0x13c41c08,0x02cabd59,0x1a38b87b,0x1d2958a8,0x12f6c87d,0x15b9d623,
        0x08e46205,0x016f303b,0x00267b0e },
      { 0x0e62b988,0x12aa72ec,0x1b4879db,0x1b8eaa22,0x06f99d8d,0x1d781e95,
        0x0e4d1843,0x0f542232,0x00b54e28 } },
    /* 136 */
    { { 0x178a876b,0x100915a8,0x14412d02,0x1f2dfe10,0x09f7651f,0x18d58a79,
        0x1398142c,0x116bf0fa,0x0084abb2 },
      { 0x0270790a,0x0f6a1cfc,0x18fd1af5,0x196b3b0b,0x022122d6,0x0e0db60f,
        0x1901d7d5,0x0ce2ecaa,0x00e5436f } },
    /* 137 */
    { { 0x0286e8d5,0x1fc812f1,0x1114ef94,0x192b690c,0x0e3a0353,0x1adef204,
        0x067b60cb,0x116b739d,0x000404f6 },
      { 0x0781e8e5,0x1699def5,0x0f0bd6f2,0x1ea0302c,0x1caa33cd,0x14b0008c,
        0x1c055d5d,0x1be15838,0x003a4263 } },
    /* 138 */
    { { 0x1aeb596d,0x14b2f664,0x0f24ad30,0x1407ce04,0x1396101e,0x1a5b1700,
        0x0d9d1c12,0x07f20bd4,0x000ca8fd },
      { 0x151b2b61,0x1291d212,0x03f341a4,0x0f513872,0x0a63e1eb,0x095f01c9,
        0x10cf9fc7,0x0c89bb61,0x0096dca2 } },
    /* 139 */
    { { 0x187510af,0x01dda1d1,0x08da8048,0x1fd55153,0x10378846,0x0bb817ca,
        0x077348e9,0x024755ab,0x004363e2 },
      { 0x00246a47,0x121d0e3a,0x17749372,0x0571a5ca,0x1af96b36,0x03022ec7,
        0x0313e6c2,0x0b9b1773,0x00840e11 } },
    /* 140 */
    { { 0x1023e8a7,0x09102f10,0x171e82fc,0x11519bb1,0x05ddfc80,0x11390b1d,
        0x1b538a4a,0x17a61bda,0x005e0d6a },
      { 0x1cfc0f64,0x1d390e13,0x157b6201,0x1d803a1c,0x19db242e,0x1f7c8e8f,
        0x09689a9e,0x1e8528b4,0x007dea48 } },
    /* 141 */
    { { 0x05060a81,0x1efb78e7,0x1e55856a,0x1f38e5f1,0x0268be79,0x162a0356,
        0x1b473f4d,0x17dd7fa2,0x00abc2a2 },
      { 0x13e2eac7,0x16337c8e,0x174119a2,0x0174c7a5,0x0d31b6f1,0x11bb8141,
        0x1f059e43,0x128d8fdd,0x004ea353 } },
    /* 142 */
    { { 0x1266309d,0x0c517c6a,0x05168fbb,0x038d8103,0x05dc10a5,0x1a2d2bc6,
        0x1f0f3b2b,0x1123929f,0x003a76e6 },
      { 0x1d7b0d0f,0x15674523,0x161297e6,0x159d2d1e,0x17fbe963,0x06392734,
        0x1191468c,0x0148cbcc,0x008212a1 } },
    /* 143 */
    { { 0x0fab8caa,0x1be30e1e,0x0508e43b,0x171d081c,0x133ca18e,0x1fb3bf4b,
        0x05933477,0x0e2b3396,0x00aa7cab },
      { 0x1c837bd1,0x17e4939d,0x1abd75c0,0x080fa186,0x1da49c06,0x09497a11,
        0x1f0c5d88,0x0e7fc0c2,0x0040e380 } },
    /* 144 */
    { { 0x07bf9b7c,0x07c04125,0x0f8c343d,0x1a46407f,0x19ce3365,0x09904be7,
        0x149afef9,0x001660aa,0x00e36047 },
      { 0x0cc6c2c7,0x0e5cc88b,0x132fb993,0x106e1174,0x0d9ec726,0x0a1a31bd,
        0x057f737b,0x0ef47bdc,0x006542d6 } },
    /* 145 */
    { { 0x1b6c377a,0x1995b683,0x0d122f8f,0x00708f20,0x08af76cb,0x09d4106d,
        0x1c875bf7,0x1dc1376d,0x00a6534a },
      { 0x1035facf,0x050bc068,0x12d1f98c,0x0ab4673b,0x1f39335e,0x07f0e223,
        0x1c89ba94,0x05fb935d,0x00f3cb67 } },
    /* 146 */
    { { 0x1b55fd83,0x19b8cff1,0x1777443a,0x0f48d90e,0x0a784e0d,0x0fd482e7,
        0x039cceb2,0x05d55d0e,0x007cafaa },
      { 0x1d53b338,0x1c0a6820,0x01f9b1a6,0x198141df,0x12b0fe0a,0x088408b3,
        0x08bbee4f,0x183737aa,0x000aab13 } },
    /* 147 */
    { { 0x12681297,0x0e6713c6,0x02551ab7,0x0a1d636a,0x1aaf2cb3,0x18b9bb30,
        0x0ba4b710,0x00508e02,0x004b91a6 },
      { 0x12f8ddcf,0x07f884ab,0x0446bd37,0x17ec3d35,0x0430e08e,0x1b0561b9,
        0x12ad23d0,0x0a6e4643,0x0049534c } },
    /* 148 */
    { { 0x107b7e9d,0x1efbeb8f,0x13545be0,0x11df4627,0x07ee3a47,0x1325b602,
        0x17b9e3bc,0x09facb58,0x00caf46c },
      { 0x12aa8266,0x026863bc,0x0da12ee8,0x08a8cd22,0x116b0edf,0x08b45725,
        0x1c3d5b99,0x0ae098ce,0x0014ce9e } },
    /* 149 */
    { { 0x165e8f91,0x0a22f1f4,0x03c924a6,0x19437596,0x0a0a0d3a,0x0387c864,
        0x09c74c73,0x14a7c993,0x001bb708 },
      { 0x158bdd7a,0x0e54f34a,0x0289ac75,0x140a1003,0x0f1ec734,0x1538a64e,
        0x040ac24e,0x1e5b4600,0x00f9d126 } },
    /* 150 */
    { { 0x0ff9563e,0x04de53d5,0x0645281d,0x0ef5fd69,0x11671dd0,0x0188dfaf,
        0x11a789e8,0x172e53d9,0x00807afc },
      { 0x09b08b77,0x1c5499be,0x0f1f8e1f,0x074f0a88,0x1d8ba86c,0x1d2ca3b7,
        0x163217eb,0x1a2cad19,0x00751adc } },
    /* 151 */
    { { 0x10715c0d,0x1751c5a0,0x1da5fde2,0x07d4e31e,0x1f06dd11,0x158a49fd,
        0x10fd997a,0x0d04a6ee,0x0029ec44 },
      { 0x150bebbc,0x0ca38ce5,0x1415088f,0x1dcb7fc8,0x1edb1399,0x0d9d4696,
        0x1df64335,0x1c725480,0x00ff9370 } },
    /* 152 */
    { { 0x06b75b65,0x0d16b4de,0x19947156,0x11f1aa4c,0x1d7d2418,0x199f1ef4,
        0x0068a2a7,0x1174553a,0x00977647 },
      { 0x129af2c7,0x0293116c,0x1a4248e2,0x1ebada9c,0x051e9334,0x03f2d44d,
        0x0beb39b3,0x07f585f0,0x0074a631 } },
    /* 153 */
    { { 0x175f079c,0x17a6feed,0x18dbeeec,0x00f92a31,0x136dd85b,0x1e7873e6,
        0x18f46db3,0x02a1fe90,0x00ab75be },
      { 0x173fc9b7,0x0d9b3e00,0x1653f420,0x14e841a4,0x11236b90,0x1f81e204,
        0x07d857f6,0x05c1688b,0x004ebeac } },
    /* 154 */
    { { 0x1c9f2c53,0x1b62ff3a,0x0ba5047a,0x0440231d,0x0c5d8d25,0x1b19fcad,
        0x1ff32221,0x0f658375,0x00df9988 },
      { 0x050aaecb,0x1bc77694,0x15a89cae,0x12303603,0x1bcac9d4,0x0a88d8e6,
        0x01625e37,0x14eef3e8,0x0027b040 } },
    /* 155 */
    { { 0x173b2eb2,0x0202edbf,0x06c84624,0x1f0a111c,0x0327ee0d,0x18a92cb1,
        0x0fd5406d,0x06fc99f4,0x00b393dd },
      { 0x1fd75165,0x091873d9,0x14cd5528,0x06898579,0x15022d66,0x18df07bd,
        0x1065b0db,0x025a08c6,0x0009588c } },
    /* 156 */
    { { 0x02601c3b,0x043049f8,0x170cd7f8,0x04a5f19e,0x0ff28fb0,0x194044a5,
        0x122e5573,0x153b73ec,0x0081c879 },
      { 0x06f56c51,0x007343e6,0x05d86301,0x08e2d27e,0x1353bfed,0x0520c82c,
        0x0f1113e2,0x1eabf823,0x00fa0d48 } },
    /* 157 */
    { { 0x01608e4d,0x0370e4ef,0x00a08b2f,0x1bb4226b,0x0c2d7010,0x0ee08abf,
        0x1f5bdadf,0x0ad6d46c,0x008ea0e1 },
      { 0x0383b3b4,0x1aa70179,0x007d4f28,0x0cd7287e,0x03ca5699,0x119596f0,
        0x16b13fd9,0x049f4016,0x003f5ab9 } },
    /* 158 */
    { { 0x19739efb,0x1bdd86ca,0x1afb034c,0x0361e9cf,0x067d1c75,0x16eb208d,
        0x15b8b694,0x10e56e84,0x008bc768 },
      { 0x02d3d253,0x0df1db94,0x035de7e9,0x0cf343eb,0x167bba9f,0x00b470b3,
        0x0d3e872b,0x120c1f9e,0x00b386f1 } },
    /* 159 */
    { { 0x0fedcfc2,0x0f9e09a9,0x1e2bc34c,0x0d7ec4c5,0x088c2539,0x1a7572b9,
        0x1136680a,0x1ee360d3,0x004cb460 },
      { 0x1b8095ea,0x133da69a,0x101d80eb,0x17f0b2df,0x0a16592b,0x0fb35b0a,
        0x088f851d,0x0112bdea,0x0052c0d5 } },
    /* 160 */
    { { 0x15339848,0x18e10870,0x1de32348,0x1451d0e0,0x0e170e87,0x1330b4ab,
        0x102e7477,0x07057613,0x004ac3c9 },
      { 0x0998987d,0x0df02a8b,0x027d3586,0x06ed895c,0x1933d8b2,0x1bb28d1f,
        0x17d07782,0x18fc72e0,0x00380d94 } },
    /* 161 */
    { { 0x01542e75,0x0d1aad54,0x006e6dc0,0x0e4943dc,0x1708796c,0x14bbb126,
        0x1ebdace8,0x0e3bc4c6,0x002ce3e1 },
      { 0x15d5bc1a,0x1f7f5a4f,0x1df8ad73,0x0ac0fc4e,0x1756ca65,0x1617ca89,
        0x19353faa,0x0a416c49,0x002e6cd8 } },
    /* 162 */
    { { 0x0c31c31d,0x142caa5c,0x1c86830d,0x067a00b7,0x19ec9685,0x11373ae3,
        0x15502f5d,0x08e858d3,0x00ca1775 },
      { 0x16d2dbb2,0x0376d7ff,0x12a74633,0x1b197a2e,0x178e8fd0,0x03c9d522,
        0x139a1d7a,0x02739565,0x00a976a7 } },
    /* 163 */
    { { 0x13fb353d,0x1328f8dc,0x1f3e9c82,0x195716af,0x15281d75,0x07d398d8,
        0x0666aa23,0x02e143e9,0x008720a7 },
      { 0x093e1b90,0x01f469bb,0x1db7f0e3,0x0bb8162d,0x08742d34,0x08055a95,
        0x04f23aa3,0x0538ed31,0x009719ef } },
    /* 164 */
    { { 0x18e35909,0x10776c6a,0x177045a0,0x0db1b867,0x05026936,0x0ce83710,
        0x13075fe6,0x0edc2ae0,0x00a50729 },
      { 0x04e70b2e,0x0151bf56,0x042aa280,0x19ecaed1,0x12a5c84d,0x1f8c322d,
        0x1c9735c6,0x13bef6ee,0x0099389c } },
    /* 165 */
    { { 0x1ada7a4b,0x1c604793,0x0e24d988,0x1d3a07fa,0x1512c3ab,0x1744bb37,
        0x0b91ad9c,0x15440590,0x00a88806 },
      { 0x1380184e,0x10102256,0x1aa2e159,0x16f18824,0x04f17a8c,0x186056c2,
        0x13f9e759,0x1f68e71b,0x000043bf } },
    /* 166 */
    { { 0x16d5192e,0x0acdaee1,0x042cabe3,0x110ba68b,0x01781acf,0x168508b0,
        0x019a0d59,0x00374d89,0x0052f3ef },
      { 0x0edcb64d,0x0c339950,0x1a0de7ce,0x10584700,0x0f3090a4,0x12fd3820,
        0x19d45b2f,0x1133de4f,0x003296bd } },
    /* 167 */
    { { 0x054d81d7,0x1b55d44a,0x1ae6cf11,0x1bcfdea3,0x179869ea,0x10e6c0e2,
        0x07a58668,0x17f5dcae,0x003b90fe },
      { 0x1496f7cb,0x1c9811f2,0x0d46f124,0x1c83b0ff,0x0b5ce55b,0x0ea44cdf,
        0x0c600fc7,0x13b3f021,0x006e8806 } },
    /* 168 */
    { { 0x143ea1db,0x11bd588d,0x1674a4b3,0x1fe352a4,0x0f1860a7,0x0110c7c2,
        0x144e146c,0x1d5bdf55,0x00a7222b },
      { 0x0b0a9144,0x1563c761,0x1e967168,0x0480a3e5,0x1ce385a0,0x1652b0a3,
        0x1a424747,0x04778558,0x00be94d5 } },
    /* 169 */
    { { 0x0b226ce7,0x17a4a2f0,0x1fa2dc1c,0x1fae8f2c,0x0c63eb8a,0x0378c2d3,
        0x1d9bb7a9,0x1fd37d18,0x007782de },
      { 0x1db38626,0x10695521,0x1d9eb45d,0x15cf0eed,0x19cdb460,0x037e2a24,
        0x192cd06e,0x0cf45125,0x00038385 } },
    /* 170 */
    { { 0x19ec1a0f,0x0c6d77eb,0x0ce725cb,0x19adfb9d,0x01a953bb,0x0ffe2c7b,
        0x1083d55d,0x1895bef6,0x00dbd986 },
      { 0x15f39eb7,0x0d5440a0,0x0365db20,0x05f9eb73,0x1717d6ee,0x03aee797,
        0x0f415195,0x188d0c17,0x008e24d3 } },
    /* 171 */
    { { 0x1a587390,0x04ec72a4,0x0fb1621d,0x16329e19,0x183c612b,0x1ed2592c,
        0x1f211b81,0x18880f75,0x00541a99 },
      { 0x024c8842,0x1920b493,0x1b017ff6,0x098255b0,0x1cf62604,0x0a5a27bf,
        0x17471674,0x093eafa6,0x00c0092c } },
    /* 172 */
    { { 0x1f2e61ef,0x1e63ae1e,0x06cd72b4,0x1083905c,0x129f47e8,0x1868c84f,
        0x113718b4,0x068e50d2,0x0075e406 },
      { 0x1bc237d0,0x1ea0fe2d,0x13c07279,0x06f7e1d8,0x1d534c95,0x0d0b1415,
        0x161a4714,0x0b18f090,0x005b7cb6 } },
    /* 173 */
    { { 0x0a28ead1,0x12538424,0x0ed1fda5,0x1b8a11fa,0x05b39802,0x1fe8bb3f,
        0x1e866b92,0x1751be12,0x007ae13e },
      { 0x0add384e,0x090b77c7,0x0cbfc1bf,0x0345b36d,0x1b5f3036,0x0c3c25e6,
        0x0ff4812e,0x0e9c551c,0x00787d80 } },
    /* 174 */
    { { 0x157fbb1c,0x0f12eb5b,0x08077af1,0x17bb6594,0x033ffe47,0x14d1b691,
        0x12112957,0x0333de50,0x005c2228 },
      { 0x08315250,0x19ea542c,0x1c25f05d,0x04345704,0x1d33f21b,0x0750ef7a,
        0x0ac2adf1,0x15775e1e,0x00e45d37 } },
    /* 175 */
    { { 0x08511c8a,0x16f8f1a1,0x129b34f4,0x0453917b,0x039a7ebb,0x18d3b13e,
        0x074d5e29,0x04509bf7,0x00ed7bc1 },
      { 0x13dea561,0x191536fc,0x03c3b473,0x07e31ba9,0x123e8544,0x10a02dd6,
        0x149f62e1,0x1928b94d,0x00aac97c } },
    /* 176 */
    { { 0x016bd00a,0x1aa753a5,0x102f307a,0x13d35beb,0x1fc06d83,0x1bf88fcd,
        0x113824ae,0x16622c7b,0x00318f97 },
      { 0x030d7138,0x06062df6,0x10c0883b,0x11be4757,0x0360644e,0x0b97d811,
        0x1d34aede,0x1433509f,0x00fa41fa } },
    /* 177 */
    { { 0x06642269,0x0016cba5,0x0de0ef51,0x10299d37,0x1e60bc81,0x1c723ca0,
        0x0788e634,0x0583a4dd,0x0038bb6b },
      { 0x0a577f87,0x1272512b,0x047f8731,0x05a4a7b8,0x007288b5,0x155fb114,
        0x0697fccd,0x00b9cec0,0x0094dd09 } },
    /* 178 */
    { { 0x1e93f92a,0x0b67bee6,0x0d7cc545,0x06679713,0x1e750a01,0x06fce4ca,
        0x0ba40901,0x0cfa4b85,0x00920778 },
      { 0x0bf39d44,0x1238f008,0x0ed4f5f8,0x1920412d,0x03d8f5f2,0x1bd9ae4e,
        0x0d453112,0x117a537d,0x0081e842 } },
    /* 179 */
    { { 0x0477199f,0x0ece15d6,0x17b3765b,0x11dddcd6,0x0fd0e8cb,0x0d9ff720,
        0x12c62bdf,0x0c5b77f4,0x001b94ab },
      { 0x0e47f143,0x0786c59e,0x1d1858d1,0x0c47f8c7,0x1938351e,0x1387e62c,
        0x03bbc63c,0x0500aab2,0x0006a38e } },
    /* 180 */
    { { 0x13355b49,0x12d809cd,0x1afe66cb,0x04cac169,0x1f3dc20e,0x1d35e934,
        0x13e3023f,0x04107b3a,0x00a7b36c },
      { 0x1b3e8830,0x068ae1d0,0x07e702d9,0x19d5c351,0x16930d5f,0x12517168,
        0x08833fbb,0x16945045,0x00be54c6 } },
    /* 181 */
    { { 0x0d91167c,0x166d9efc,0x099897b5,0x187ef3cf,0x0c7f4517,0x12479a35,
        0x0aedc415,0x157d5c04,0x00bf30a5 },
      { 0x13828a68,0x13bc2df4,0x0fbc0da3,0x038664fe,0x146b2516,0x0ff5ac90,
        0x04eb846d,0x1bc4e65a,0x00d1c820 } },
    /* 182 */
    { { 0x1038b363,0x01f09a3c,0x01794641,0x023ea8d6,0x0cad158c,0x1d5f3013,
        0x168d3f95,0x1dad1431,0x00b7d17b },
      { 0x029c2559,0x0652c48f,0x1fff6111,0x1406ecb7,0x069484f7,0x1257ba72,
        0x11912637,0x0bcc8259,0x003997fd } },
    /* 183 */
    { { 0x0bd61507,0x103a3414,0x09934abc,0x0265aa69,0x015e329e,0x0fd84545,
        0x0fa3ffb7,0x05278d82,0x000eeb89 },
      { 0x07e259f8,0x0db4d1f5,0x0f9f99fa,0x1b6fcda2,0x1a685ce1,0x0c7b568f,
        0x1bbc9dcc,0x1f192456,0x00228916 } },
    /* 184 */
    { { 0x0a12ab5b,0x0cd712d8,0x1ef04da5,0x022e3f2a,0x02b0ccc1,0x014f68b7,
        0x05fa0161,0x03add261,0x00ec05ad },
      { 0x0c3f3708,0x0bdd2df5,0x0d675dc5,0x15f26a61,0x034e531b,0x091b88c1,
        0x0cdd1ed5,0x0acffe23,0x007d3141 } },
    /* 185 */
    { { 0x16dfefab,0x1ece02e7,0x0cddc1de,0x1e44d1b9,0x0bb95be2,0x16cb9d1c,
        0x1e8f94fa,0x1f93783a,0x00e9ce66 },
      { 0x0f6a02a1,0x0d50abb3,0x19803b5d,0x010fbec1,0x1c1b938c,0x1f9a3466,
        0x1947e251,0x002e4500,0x00d9650b } },
    /* 186 */
    { { 0x1a057e60,0x025a6252,0x1bc97914,0x19877d1b,0x1ccbdcbc,0x19040be0,
        0x1e8a98d4,0x135009d6,0x0014d669 },
      { 0x1b1f411a,0x045420ae,0x035da70b,0x175e17f0,0x177ad09f,0x17c80e17,
        0x062ad37b,0x0821a86b,0x006f4c68 } },
    /* 187 */
    { { 0x16c24a96,0x1936fa74,0x0f6668e1,0x1b790bf9,0x0e30a534,0x17794595,
        0x0aecf119,0x1fac2313,0x004c4350 },
      { 0x1855b8da,0x0b3fb8b7,0x0f0e284a,0x0847288c,0x1334341a,0x0a09f574,
        0x02d70df8,0x084b4623,0x00a726d2 } },
    /* 188 */
    { { 0x148c1086,0x17359f74,0x14e8b876,0x1ca07b97,0x022f3f1d,0x169f81e8,
        0x0e48fcd7,0x10598d9e,0x0013639e },
      { 0x0dafaa86,0x1649c7de,0x15289626,0x178bf64c,0x11329f45,0x19372282,
        0x168c658e,0x1c383466,0x00ca9365 } },
    /* 189 */
    { { 0x0c3b2d20,0x10ad63aa,0x138906cd,0x14a82f20,0x1071d742,0x10e2664e,
        0x0a96c214,0x0692e16e,0x009ce29c },
      { 0x0d3e0ad6,0x0640fb9b,0x1e10d323,0x01b53de5,0x062d9806,0x0e8d3674,
        0x1e60d7b4,0x1af56855,0x0048c4ab } },
    /* 190 */
    { { 0x00c7485a,0x110d8662,0x09d36ff4,0x08ab77ca,0x1d2e8ead,0x1b4c4931,
        0x0f2d24f1,0x065ecf66,0x0078017c },
      { 0x130cb5ee,0x0e9abb4c,0x1023b4ae,0x029d2818,0x11a4dc0d,0x1faa9397,
        0x1013e2de,0x0a9bcb83,0x0053cd04 } },
    /* 191 */
    { { 0x1d28ccac,0x06ac2fd2,0x16dd1baf,0x047cac00,0x123aa5f8,0x1850e680,
        0x0a3df1e7,0x183a7aff,0x00eea465 },
      { 0x0551803b,0x00832cf8,0x19abdc1e,0x16b33ef9,0x08e706c0,0x13b81494,
        0x064d0656,0x148f5cd2,0x001b6e42 } },
    /* 192 */
    { { 0x167d04c3,0x14049be7,0x1bae044b,0x0257c513,0x14d601e3,0x0c43c92c,
        0x14f55ad7,0x02830ff7,0x000224da },
      { 0x0c5fe36f,0x1d5dc318,0x1d47d7e1,0x1e78c09d,0x029ec580,0x18dfd9da,
        0x1cce593e,0x1e0857ff,0x0060838e } },
    /* 193 */
    { { 0x1e0bbe99,0x19659793,0x0a8e7b90,0x1489e609,0x139037bd,0x1e3d4fd4,
        0x190d7d25,0x0045a662,0x00636eb2 },
      { 0x13ae00aa,0x07e8730c,0x0b9b4bff,0x1401fc63,0x1901c875,0x0c514fc9,
        0x0eb3d0d9,0x16c72431,0x008844ee } },
    /* 194 */
    { { 0x0b3bae58,0x0a0b8e93,0x18e7cf84,0x07bee22f,0x0eada7db,0x1e3fc0d4,
        0x027b34de,0x1b8a3f6f,0x0027ba83 },
      { 0x1bf54de5,0x1efa1cff,0x1f869c69,0x0e06176b,0x17a48727,0x071aed94,
        0x12ad0bba,0x0690fe74,0x00adb62d } },
    /* 195 */
    { { 0x0175df2a,0x188b4515,0x030cba66,0x15409ec3,0x10916082,0x19738a35,
        0x02cb2793,0x0ecebcf9,0x00b990fd },
      { 0x0df37313,0x014ecb5a,0x0d01e242,0x00aaf3a1,0x077111c2,0x17253c04,
        0x06359b26,0x1f29a21a,0x0081707e } },
    /* 196 */
    { { 0x03d6ff96,0x1ebe5590,0x010cd825,0x0a37f81b,0x0db4b5b8,0x11e26821,
        0x09709a20,0x1d5ab515,0x003792da },
      { 0x141afa0b,0x140c432c,0x160d9c54,0x13ce8285,0x0e0a7f3e,0x1293adf2,
        0x06e85f20,0x0bd29600,0x005abd63 } },
    /* 197 */
    { { 0x0ac4927c,0x13fd4270,0x1233c8dc,0x10c06b4f,0x0a0dfe38,0x0af5256e,
        0x184292f3,0x04308d56,0x005995bf },
      { 0x029dfa33,0x087c305c,0x03f062fa,0x1fc55d2b,0x10366caa,0x17a23c31,
        0x047a6cee,0x145a9068,0x0044c32c } },
    /* 198 */
    { { 0x040ed80c,0x1a54bf8f,0x14b2a0a9,0x07196263,0x16ad95f9,0x0925be16,
        0x15314fc8,0x1f701054,0x001f2162 },
      { 0x120b173e,0x1233e62b,0x17c4be5f,0x114ccc10,0x165dc40e,0x0107264e,
        0x1f2633af,0x05787d20,0x008f1d40 } },
    /* 199 */
    { { 0x1bc4058a,0x1ac97ce7,0x0bd59c13,0x1c296c52,0x18c57b15,0x1f1bde0e,
        0x0fe71573,0x08724ddb,0x00b1980f },
      { 0x12c76b09,0x0619f049,0x0c1fde26,0x0a4f3a67,0x1b4611df,0x156a431d,
        0x1915bc23,0x1366e891,0x002828ad } },
    /* 200 */
    { { 0x04cf4ac5,0x0b391626,0x1992beda,0x18347fbb,0x10832f5a,0x1d517044,
        0x0e401546,0x04eb4296,0x004973f1 },
      { 0x122eac5d,0x0cec19a9,0x166d5a39,0x0fddea17,0x083935e0,0x1907d12c,
        0x0b1eacd9,0x1a1b62d1,0x006dac8e } },
    /* 201 */
    { { 0x0da835ef,0x1daa2d77,0x043b547d,0x0227a43a,0x01b094aa,0x12f009ba,
        0x19300d69,0x0b24173b,0x004b23ef },
      { 0x1c4c7341,0x015db401,0x162f0dfa,0x0ee0da7e,0x03ee8d45,0x1c31d28f,
        0x0939cd49,0x069bbe93,0x004dd715 } },
    /* 202 */
    { { 0x15476cd9,0x1ca23394,0x069c96ef,0x1a0e5fc6,0x167e0648,0x045c7e25,
        0x16ec5107,0x0005e949,0x00fd3170 },
      { 0x0995d0e1,0x05a1ffa4,0x1dca6a87,0x0d2ba21d,0x1898276e,0x1cbb20bc,
        0x0d978357,0x1192ad3e,0x0014fac5 } },
    /* 203 */
    { { 0x1312ae18,0x0cd0032f,0x124ff26b,0x0b1b81f9,0x12846519,0x0120453e,
        0x09436685,0x0a26d57b,0x00ed7c76 },
      { 0x05d4abbc,0x113878d1,0x0844fa91,0x1bb1e7e3,0x1952f9b5,0x183aada8,
        0x1d4f1826,0x1ee9a5d3,0x00fefcb7 } },
    /* 204 */
    { { 0x1a119185,0x084a4bd5,0x1116e92f,0x1d186155,0x01179d54,0x1cef5529,
        0x002d2491,0x0fd0fc1b,0x001801a5 },
      { 0x1cafffb0,0x19e9fc6f,0x09549001,0x0678175c,0x1dfbc6cf,0x1b1dadaf,
        0x0191e075,0x03c3d5a2,0x009f8fc1 } },
    /* 205 */
    { { 0x1e69544c,0x0c1d0b8a,0x12de04c5,0x1f0acfe0,0x04c320ea,0x147e93c5,
        0x06a4788a,0x13a7a74d,0x00a9d380 },
      { 0x19a2da3b,0x1b616162,0x057211e4,0x1979ec31,0x1086938c,0x122731ea,
        0x1bdd7994,0x15dc22f1,0x003006b9 } },
    /* 206 */
    { { 0x09eead28,0x1d8f9586,0x1d37ef02,0x1ec6bb13,0x089397ee,0x0bfed967,
        0x1d841d1d,0x1ae8bf1e,0x000ab85f },
      { 0x1e5b4549,0x06d3e499,0x048bc87b,0x0576b92f,0x180404be,0x093a5a1d,
        0x0b089868,0x0ea23d28,0x00b122d6 } },
    /* 207 */
    { { 0x06a5ae7a,0x1f303df3,0x0b72f8ce,0x0e07f4ed,0x0e5c501e,0x0180a75b,
        0x0bb2be41,0x18212fb7,0x009f599d },
      { 0x0ff250ed,0x0badb8c0,0x0688371b,0x122ae869,0x027a38eb,0x02d20859,
        0x0de10958,0x1c114529,0x007d5528 } },
    /* 208 */
    { { 0x00c26def,0x07ac7b31,0x0acb47bc,0x0b0bd4b0,0x03881025,0x0bcd80e7,
        0x1cc3ef9f,0x002607e2,0x0028ccea },
      { 0x19644ba5,0x0ed5e68b,0x1ffc2e34,0x0c87d00d,0x1e17b1fc,0x1b7e3359,
        0x0efe9829,0x09143a02,0x00c18baf } },
    /* 209 */
    { { 0x1dc4216d,0x0731c642,0x1850ab0d,0x0020ce40,0x1064a00c,0x10b8cafa,
        0x05af514e,0x13b6f52b,0x009def80 },
      { 0x07ab8d2c,0x0f432173,0x0de8ad90,0x080866c4,0x0218bb42,0x1536b262,
        0x1395f541,0x160d1011,0x000357f8 } },
    /* 210 */
    { { 0x0cd2cc88,0x14edf322,0x0e3ce763,0x03851be1,0x0a0c8cc6,0x0c3a6698,
        0x021d28c2,0x1ba36913,0x00e4a01a },
      { 0x157cd8f9,0x168f7567,0x1653120b,0x0cfa7d7a,0x0f7871b7,0x0e38bde9,
        0x10c29ca5,0x0f39c219,0x00466d7d } },
    /* 211 */
    { { 0x1dada2c7,0x1e98c494,0x06a89f51,0x014d871f,0x059e14fa,0x1e944105,
        0x146a4393,0x0448a3d5,0x00c672a5 },
      { 0x1d86b655,0x0303e642,0x0b52bc4c,0x06ba77f3,0x172a6f02,0x03402b88,
        0x144e6682,0x1f5e54ce,0x005e3d64 } },
    /* 212 */
    { { 0x1b3b4416,0x1320863c,0x0c9b666a,0x1f9f0bd5,0x16a74cd8,0x1ba56db2,
        0x0bf17aff,0x12bd71c8,0x006c8a7a },
      { 0x102a63bd,0x06305d3d,0x03c011c4,0x1e460717,0x190b06b2,0x1b9c1896,
        0x0a4631b0,0x0455b059,0x00348ae4 } },
    /* 213 */
    { { 0x1ccda2fb,0x1a3a331a,0x01c9b49f,0x1995431c,0x11f2022a,0x1bc12495,
        0x14ba16b7,0x1c1b3de5,0x00c1074d },
      { 0x0e9a65b3,0x079e7225,0x15c546ff,0x03c9580b,0x09788fd7,0x0fa86735,
        0x1ff351c4,0x1b793ca9,0x00fbadfb } },
    /* 214 */
    { { 0x00a99363,0x189f8e69,0x1c89dd45,0x0acb1ed9,0x159b2b91,0x1ae69269,
        0x1f365a05,0x16906e2d,0x00b7f976 },
      { 0x1d6dbf74,0x1ac7126a,0x10ebcd95,0x0775fae3,0x1dfe38d2,0x1bb00121,
        0x001523d1,0x05d95f99,0x00f4d41b } },
    /* 215 */
    { { 0x1dabd48d,0x0f8e7947,0x101e2914,0x037c6c65,0x146e9ce8,0x14ba08b8,
        0x1c41ab38,0x1d5c02c1,0x00180824 },
      { 0x06e58358,0x1c3b4c5b,0x1b28d600,0x0d0ea59c,0x1e6c5635,0x071a2f20,
        0x149608e0,0x073079ed,0x0067e5f6 } },
    /* 216 */
    { { 0x0f4899ef,0x04e65c6e,0x0ed1303e,0x002be13d,0x18ec9949,0x093b592c,
        0x1f1951be,0x13409823,0x009fef78 },
      { 0x13d2a071,0x09b3f67a,0x1466c25b,0x1c34ff48,0x02eefb10,0x1fd8308f,
        0x188329ac,0x10353389,0x00bc80c1 } },
    /* 217 */
    { { 0x05eb82e6,0x1929b7c7,0x1b2e4825,0x109f8fea,0x1da5e1a4,0x10b8a85a,
        0x1c431e38,0x0c53f19b,0x0049270e },
      { 0x0a6b50ad,0x11cdbddf,0x0e23ff06,0x05098344,0x1197b9a0,0x158bc083,
        0x1dfd500f,0x1f2c26e5,0x00d2ee52 } },
    /* 218 */
    { { 0x08e0362a,0x1be6942c,0x09765374,0x1f514f1f,0x0a526442,0x1b72d21a,
        0x1ccebfe0,0x17dcb576,0x00dfb478 },
      { 0x073eede6,0x08f8e73b,0x16cbc12a,0x1215a856,0x0da2fa53,0x1bdfaa98,
        0x1ce9799b,0x16811be8,0x00d9a140 } },
    /* 219 */
    { { 0x0e8ea498,0x10110dab,0x18fb8243,0x08f0526a,0x12ade623,0x01c899ae,
        0x0c6b81ae,0x11ac47e9,0x00760c05 },
      { 0x0198aa79,0x1c4dac66,0x1eae9fc2,0x1121a5e0,0x0556af74,0x00887ef1,
        0x10253881,0x05b1e320,0x00714198 } },
    /* 220 */
    { { 0x0d4b0f45,0x1850719a,0x0aa5385b,0x10167072,0x01d5ed92,0x126359e3,
        0x191cebcc,0x19d13aa9,0x003af9d1 },
      { 0x00930371,0x0c7bcc09,0x105c25ff,0x04cc9843,0x0309beda,0x02ee6e21,
        0x17583a55,0x186e72af,0x00b1f815 } },
    /* 221 */
    { { 0x09fec44a,0x07d53c74,0x0a932be1,0x055c8e79,0x0a624c8c,0x003ee0db,
        0x0149a472,0x0282a87e,0x00a41aed },
      { 0x1d5ffe04,0x121a9ccb,0x16db8810,0x1965bec4,0x177758ba,0x105f43c0,
        0x03be1759,0x1bb0df6c,0x00d6e9c1 } },
    /* 222 */
    { { 0x06853264,0x15174bf6,0x0c1282ce,0x0a676fc4,0x0e9be771,0x15dbdc75,
        0x03086e44,0x0215d37f,0x009c9c6e },
      { 0x0030b74c,0x1184d2cf,0x18c7a428,0x0e929ad4,0x179f24ed,0x0591d24d,
        0x06da27d1,0x12c81f4c,0x00566bd5 } },
    /* 223 */
    { { 0x018061f3,0x136008c6,0x00ff1c01,0x164ba6f9,0x13245190,0x04701393,
        0x117bc17f,0x121ea4a6,0x00cf2c73 },
      { 0x10eb30cf,0x04de75a0,0x1ddc0ea8,0x05d7741a,0x1f255cfd,0x021d0a87,
        0x05e7a10b,0x0ab15441,0x0002f517 } },
    /* 224 */
    { { 0x0ddb7d07,0x0b77bca5,0x1155400e,0x1f8e8448,0x0a3ce0b4,0x075663c5,
        0x05f7ebfe,0x14bd1a9b,0x0014e9ad },
      { 0x0f7079e2,0x15240509,0x0c2003b6,0x15479bc9,0x0157d45b,0x0f16bc1c,
        0x0ba005d9,0x1571d3b3,0x00a0ad4f } },
    /* 225 */
    { { 0x0a653618,0x1fdbb10a,0x1aaa97c2,0x05027863,0x09d5e187,0x139ba24a,
        0x1478554f,0x170dcadd,0x00bcd530 },
      { 0x12e9c47b,0x14df4299,0x00166ac5,0x0eedfd6a,0x1fbb4dc2,0x0bb08c95,
        0x107736ea,0x19ed2f26,0x00909283 } },
    /* 226 */
    { { 0x16e81a13,0x1d801923,0x05c48e59,0x1c3532c4,0x019d69be,0x1b0de997,
        0x126823b4,0x19359c2a,0x0035eeb7 },
      { 0x1e4e5bdc,0x140572d3,0x13bb1b84,0x1a59a76d,0x06bc12dc,0x11263713,
        0x01914b90,0x1e88915d,0x009a8b2c } },
    /* 227 */
    { { 0x09d03b59,0x1238df90,0x16bcaafd,0x1cc5476c,0x1eec9c90,0x18b475ea,
        0x0de7fdff,0x1e9a8922,0x006bdb60 },
      { 0x0a55bc30,0x16d7f5e4,0x025ff836,0x1d5a2c20,0x03bddc79,0x0ba0a60f,
        0x02a50b86,0x1fb29741,0x0001ec3c } },
    /* 228 */
    { { 0x1c9485c2,0x1313bf5e,0x1ec431ee,0x1934f245,0x08d8a48c,0x0b07b851,
        0x13d93d87,0x1808ea8c,0x00d1acb1 },
      { 0x06f36612,0x13481589,0x186362f4,0x07489dc0,0x157ee59c,0x14099841,
        0x1b0937e2,0x13a80ac4,0x007dcd07 } },
    /* 229 */
    { { 0x105a4b48,0x073ea69f,0x08c1dc97,0x1a52a46e,0x0915aadc,0x1cb8c095,
        0x06e3463d,0x1126efa3,0x000bf535 },
      { 0x0c68ea73,0x0f66cad3,0x0e96134d,0x07779504,0x1a723c7f,0x1a637a39,
        0x1bf27ed9,0x1b3c2cd0,0x00d28be4 } },
    /* 230 */
    { { 0x18fa8e4b,0x095cc831,0x0ff63f17,0x1e30dd12,0x1b6fc559,0x115521b7,
        0x0338e9b7,0x154a21f1,0x00d76007 },
      { 0x123a4988,0x088555b2,0x17409ccb,0x0b9e88e9,0x07278b45,0x184151a0,
        0x0c05fd19,0x0d166077,0x00f2b52f } },
    /* 231 */
    { { 0x1835b4ca,0x0abf57d4,0x19a72f03,0x0465f976,0x031982d2,0x1b406332,
        0x14ea3bba,0x11d98b5d,0x00d8dbe9 },
      { 0x05a02709,0x1d4df1fe,0x0e87ea32,0x1cd1cbeb,0x0a85230b,0x01e6f887,
        0x1c17faf5,0x147dcab2,0x00e01593 } },
    /* 232 */
    { { 0x0a75a0a6,0x1f2d7a87,0x01600cf4,0x044d58af,0x16406512,0x0a87e80b,
        0x1c19bf9b,0x1635d71d,0x00afec07 },
      { 0x00bb0a31,0x1dccab3c,0x0c26ab9f,0x15e7986e,0x1f3896f1,0x10ad00d5,
        0x1f76454e,0x0a8dc5b7,0x00a71b93 } },
    /* 233 */
    { { 0x18f593d2,0x1c709700,0x1e048aef,0x12085140,0x0f2add1a,0x02ed85d2,
        0x0f645414,0x0b8c50a4,0x0053a200 },
      { 0x07f2b935,0x1e45b1cf,0x00a58681,0x1f2eb583,0x0ca2c2bf,0x1753ba8c,
        0x18f61af3,0x1367ab11,0x00bf47d1 } },
    /* 234 */
    { { 0x1d7665d5,0x194b3d3e,0x0bd37959,0x0060ae5e,0x0903f4e3,0x02d7406a,
        0x06d85100,0x0fe73934,0x00001c2c },
      { 0x09efc6d6,0x01d400a3,0x11e9c905,0x017b54f7,0x150a4c81,0x1385d3c0,
        0x066d7d95,0x1cf0dff7,0x00fdadf8 } },
    /* 235 */
    { { 0x1fc00785,0x09c65c47,0x123ad9ff,0x14eb2276,0x08fbc77f,0x082adf9b,
        0x12501153,0x09ab5487,0x003a838e },
      { 0x1e97bb9a,0x10b31949,0x07653655,0x1266c688,0x12a839eb,0x08d3056d,
        0x168d4556,0x0af0e7c3,0x003cdb82 } },
    /* 236 */
    { { 0x1de77eab,0x1b8a054b,0x19204244,0x038a1a82,0x1d0dff7e,0x05696758,
        0x1ee9d8b7,0x113e3eaf,0x005a60cc },
      { 0x00d45673,0x059b1c12,0x04f19560,0x057c32b2,0x0b7411b8,0x025c6eb2,
        0x1f0015ca,0x0dfb7fb1,0x00922ff5 } },
    /* 237 */
    { { 0x09a129a1,0x1932ef76,0x0a138106,0x039caf98,0x1be3ca5b,0x0623675f,
        0x158810e0,0x0fbed8b9,0x0072919a },
      { 0x0fb90f9a,0x0c7a29d4,0x1900c6ca,0x13801711,0x11856d71,0x073bbcb7,
        0x026b8cb0,0x1006c481,0x005e7917 } },
    /* 238 */
    { { 0x1f63cdfb,0x00b762ab,0x12b93f57,0x146ae3e3,0x197ca8e6,0x15f52b02,
        0x1eaff389,0x0e3c4985,0x004e0a53 },
      { 0x05765357,0x1b52069d,0x1ce8ad09,0x135e881a,0x11a323c8,0x185720e8,
        0x13bae3cd,0x031aacc0,0x00f5ff78 } },
    /* 239 */
    { { 0x1a09df21,0x1f9f1ff0,0x1ba391fe,0x0ba51dcc,0x0901526d,0x1e8514e4,
        0x1990825a,0x1d2a67eb,0x00e41df0 },
      { 0x13ba9e3f,0x02fed205,0x0136254c,0x0819d64c,0x167c7f23,0x10c93f81,
        0x157c219b,0x0dd589e2,0x008edd7d } },
    /* 240 */
    { { 0x0bfc8ff3,0x0d0ee070,0x0dbd0bf2,0x1fb057d2,0x181ef14e,0x17be6651,
        0x1a599c05,0x195db15d,0x001432c1 },
      { 0x10b23c26,0x0342414b,0x0d6c9cfb,0x1fd0e60e,0x10f5aa64,0x1b72f577,
        0x0b1b8e27,0x016b591a,0x00caef48 } },
    /* 241 */
    { { 0x15315922,0x122e4bc3,0x18f32954,0x12a2e260,0x0f2cbd82,0x10685b27,
        0x08dbcf39,0x0fd1df5c,0x00d0ba17 },
      { 0x11b3af60,0x1d4d747d,0x0b688394,0x12d5ca7a,0x0ef281a7,0x1b02efcf,
        0x18580758,0x0f838a95,0x00f31c95 } },
    /* 242 */
    { { 0x09cc4597,0x07ac6a92,0x18280a30,0x002b6175,0x0814adc5,0x1e2ab9a5,
        0x10ebbf17,0x1972dc2f,0x00013404 },
      { 0x09a824bf,0x14f12c2e,0x07abb5ec,0x0630bc00,0x168acd59,0x134130f7,
        0x19b235bb,0x09723267,0x006f377c } },
    /* 243 */
    { { 0x08333fd2,0x1c9dd68d,0x0aa56e27,0x060404b4,0x15acea89,0x081bf57b,
        0x14188479,0x09da5a12,0x006dba3e },
      { 0x104399cd,0x0477cc66,0x0dceb7a9,0x038cddcd,0x0caf3181,0x03a960bf,
        0x129dcbd8,0x08477d9e,0x00f13cf3 } },
    /* 244 */
    { { 0x0919e2eb,0x175cf605,0x0b03da33,0x13432bec,0x0229983a,0x1ddb3d5d,
        0x0b4f3ee8,0x1524e977,0x00c83fa9 },
      { 0x02fa1ce0,0x0be8d85b,0x063befc3,0x16c1ea68,0x06f04e58,0x17cf2938,
        0x1a0efea3,0x1e8bae04,0x00b49d70 } },
    /* 245 */
    { { 0x1ad5513b,0x0a63a887,0x1d478b64,0x065dd962,0x19d5905f,0x020c6cfd,
        0x073db614,0x1761861e,0x0059cfad },
      { 0x15cb7fd6,0x0b3d611a,0x0109a8f8,0x06cf7104,0x18864249,0x02c64853,
        0x0d9fabbb,0x0c46a949,0x005babf3 } },
    /* 246 */
    { { 0x0e424865,0x1e4c0e8f,0x1955dfcd,0x0050f1e5,0x0c0588b0,0x1878dcf0,
        0x03c1c0a5,0x14f204d9,0x006188c6 },
      { 0x10f244da,0x17cd0cde,0x02021cc1,0x19dab9f6,0x136371ec,0x07cdcf90,
        0x0764d51c,0x0ebbea17,0x00993fe4 } },
    /* 247 */
    { { 0x1b2c3609,0x0718e6fc,0x11b53a9a,0x16338058,0x1510184e,0x160d4d3b,
        0x05adeb27,0x0cc9900c,0x0081f764 },
      { 0x15fbe978,0x0be152d3,0x00ecd587,0x07fda7e3,0x1d2bf674,0x0f82280e,
        0x18360e34,0x054bfd20,0x00564a81 } },
    /* 248 */
    { { 0x1a817d1d,0x12d327a7,0x0a0b83de,0x12d0897d,0x1f9aa55f,0x0d07e6ab,
        0x15b2d7fd,0x19e01ca3,0x00226bf3 },
      { 0x0f2833cf,0x168d4fc9,0x13e26a35,0x0146b49e,0x17f7720a,0x1624c79f,
        0x00d8454d,0x08ffe4af,0x0068779f } },
    /* 249 */
    { { 0x13043d08,0x0d860e0b,0x10083e9e,0x08cee83f,0x126d0a54,0x1f144d36,
        0x182f4dd9,0x1a3d6125,0x0097bcb0 },
      { 0x132ed3c3,0x15b75547,0x006f120a,0x09e2a365,0x178f3c8a,0x1a79dfd0,
        0x1955346f,0x1d014f08,0x00a872ff } },
    /* 250 */
    { { 0x032b2086,0x0d5bc9ad,0x183d21ac,0x16e21d02,0x0e6bee1e,0x06c89db5,
        0x0daa6f43,0x1f96e654,0x0002812b },
      { 0x0f605318,0x11febe56,0x1f5b4769,0x1cbaa1fb,0x0d619646,0x01cc1081,
        0x1abe875a,0x193fca72,0x0007391c } },
    /* 251 */
    { { 0x0b80d02b,0x080abf84,0x01dfdff1,0x0667a2c5,0x142ae6b8,0x0d7c3c6a,
        0x0821eb28,0x1b8fcda5,0x00355d2a },
      { 0x087386e1,0x00f99ad1,0x190c9d6d,0x0e5529f1,0x189eafd2,0x1166f3cc,
        0x09e4a1b2,0x1c6f8547,0x003dc2b1 } },
    /* 252 */
    { { 0x04581352,0x144e90e0,0x19e0afb5,0x01904a6e,0x1701f0a0,0x0ac84ff6,
        0x11ac80ef,0x020799b0,0x00c47869 },
      { 0x04c768ed,0x0dd3b841,0x107d95d7,0x1dd404d0,0x0ce0e72f,0x1f6ab566,
        0x14c9ccc4,0x0d1ab769,0x00ccc429 } },
    /* 253 */
    { { 0x1d7620b9,0x07286f09,0x04a95aa5,0x14b914b3,0x087c9d89,0x1b2033aa,
        0x073f7001,0x0855490e,0x00e147eb },
      { 0x0cf3ae46,0x1a55a775,0x0d43ef89,0x126df6a0,0x040eafd4,0x1f23a464,
        0x1b8f7cab,0x08e101d2,0x00239ac0 } },
    /* 254 */
    { { 0x0bfee8d4,0x00e8f9a9,0x1ec3fb12,0x016b9ff4,0x1af3cce8,0x064f1674,
        0x16744171,0x147ebefc,0x00c55fa1 },
      { 0x0257c227,0x0c378a74,0x0af802cc,0x02ca7e68,0x04fb2c5b,0x04cc5548,
        0x1a6426bf,0x139a9e96,0x00094cd9 } },
    /* 255 */
    { { 0x1703beba,0x14c0e426,0x13aca462,0x03a2a065,0x149ec863,0x1964f1de,
        0x14ce9117,0x16c85575,0x00b90a30 },
      { 0x14a5abf9,0x032a027d,0x16dd80ed,0x0ea186eb,0x1d89f004,0x0166651a,
        0x13ddbe69,0x13436f24,0x00019f8b } },
};

/* Scalar multiplication of the P256 base point: r = k * G.
 * When map is true the result is converted to affine coordinates.
 *
 * Thin wrapper that hands the base point and its pre-computed table to the
 * generic stripe multiplier.
 *
 * Stripe implementation.
 * Pre-generated: 2^0, 2^32, ...
 * Pre-generated: products of all combinations of above.
 * 8 doubles and adds (with qz=1)
 *
 * r     Point that receives the result.
 * k     Scalar to multiply the base point by.
 * map   Non-zero to convert the result to affine coordinates.
 * ct    Constant time required.
 * heap  Heap hint forwarded to the stripe multiplier for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_base_9(sp_point_256* r, const sp_digit* k,
        int map, int ct, void* heap)
{
    int err;

    /* The base point and its table are fixed; everything else is passed
     * straight through from the caller. */
    err = sp_256_ecc_mulmod_stripe_9(r, &p256_base, p256_table, k, map, ct,
                                     heap);

    return err;
}

#endif

/* Public entry point: multiply the P256 base point by a scalar (r = km * G).
 * When map is true the result is converted to affine coordinates.
 *
 * The scalar is loaded into 9 digits, multiplied with the constant-time flag
 * set, and the internal point is converted back to an ecc_point.
 *
 * km    Scalar to multiply by.
 * r     Resulting point.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_mulmod_base_256(const mp_int* km, ecc_point* r, int map, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* point = NULL;
    sp_digit* k = NULL;
#else
    sp_point_256  point[1];
    sp_digit k[9];
#endif
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* Small-stack builds take the working storage from the heap. */
    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
                                         DYNAMIC_TYPE_ECC);
    if (point == NULL) {
        err = MEMORY_E;
    }
    else {
        k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
                               DYNAMIC_TYPE_ECC);
        if (k == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        sp_256_from_mp(k, 9, km);

        err = sp_256_ecc_mulmod_base_9(point, k, map, 1, heap);
        if (err == MP_OKAY) {
            err = sp_256_point_to_ecc_point_9(point, r);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (k != NULL) {
        XFREE(k, heap, DYNAMIC_TYPE_ECC);
    }
    if (point != NULL) {
        XFREE(point, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Compute r = km * G + am for the P256 base point G.
 * When map is true the result is converted to affine coordinates.
 *
 * When inMont is false the ordinates of the point to add are first converted
 * with sp_256_mod_mul_norm_9 against the curve modulus.
 *
 * km      Scalar to multiply by.
 * am      Point to add to scalar multiply result.
 * inMont  Point to add is in montgomery form.
 * r       Resulting point.
 * map     Indicates whether to convert result to affine.
 * heap    Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_mulmod_base_add_256(const mp_int* km, const ecc_point* am,
        int inMont, ecc_point* r, int map, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* point = NULL;
    sp_digit* k = NULL;
#else
    sp_point_256 point[2];
    sp_digit k[9 + 9 * 2 * 6];
#endif
    sp_point_256* addP = NULL;
    sp_digit* tmp = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
                                         DYNAMIC_TYPE_ECC);
    if (point == NULL) {
        err = MEMORY_E;
    }
    else {
        k = (sp_digit*)XMALLOC(
            sizeof(sp_digit) * (9 + 9 * 2 * 6),
            heap, DYNAMIC_TYPE_ECC);
        if (k == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Second point slot holds the addend; scratch follows the scalar. */
        addP = point + 1;
        tmp = k + 9;

        sp_256_from_mp(k, 9, km);
        sp_256_point_from_ecc_point_9(addP, am);

        if (!inMont) {
            /* Convert each ordinate, stopping at the first failure. */
            err = sp_256_mod_mul_norm_9(addP->x, addP->x, p256_mod);
            if (err == MP_OKAY) {
                err = sp_256_mod_mul_norm_9(addP->y, addP->y, p256_mod);
            }
            if (err == MP_OKAY) {
                err = sp_256_mod_mul_norm_9(addP->z, addP->z, p256_mod);
            }
        }
    }
    if (err == MP_OKAY) {
        /* Keep the product projective (map = 0) so it can be added to. */
        err = sp_256_ecc_mulmod_base_9(point, k, 0, 0, heap);
    }
    if (err == MP_OKAY) {
        sp_256_proj_point_add_9(point, point, addP, tmp);

        if (map) {
            sp_256_map_9(point, point, tmp);
        }

        err = sp_256_point_to_ecc_point_9(point, r);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (k != NULL) {
        XFREE(k, heap, DYNAMIC_TYPE_ECC);
    }
    if (point != NULL) {
        XFREE(point, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
                                                        defined(HAVE_ECC_VERIFY)
#endif /* WOLFSSL_VALIDATE_ECC_KEYGEN | HAVE_ECC_SIGN | HAVE_ECC_VERIFY */
/* Increment a in place. (a = a + 1)
 *
 * The lowest digit is bumped and the number is then normalized.
 *
 * a  A single precision integer; updated in place.
 */
SP_NOINLINE static void sp_256_add_one_9(sp_digit* a)
{
    a[0] += 1;
    sp_256_norm_9(a);
}

/* Convert a big endian unsigned byte array into 29-bit digits.
 *
 * Bytes are consumed from the end of the array (least significant first) and
 * packed into r. Conversion stops once size digits are filled; any digits of
 * r above the last one written are set to zero.
 *
 * r     A single precision integer that receives the value.
 * size  Number of digits available in r.
 * a     Byte array.
 * n     Number of bytes in array to read.
 */
static void sp_256_from_bin(sp_digit* r, int size, const byte* a, int n)
{
    int idx;
    int word = 0;
    word32 shift = 0;

    r[0] = 0;
    idx = n - 1;
    while (idx >= 0) {
        r[word] |= (((sp_digit)a[idx]) << shift);
        if (shift < 21U) {
            /* Whole byte fitted in the current digit. */
            shift += 8U;
        }
        else {
            /* Byte straddles a digit boundary: trim this digit and put the
             * remaining high bits at the bottom of the next one. */
            r[word] &= 0x1fffffff;
            shift = 29U - shift;
            if (word + 1 >= size) {
                break;
            }
            word++;
            r[word] = (sp_digit)a[idx] >> shift;
            shift = 8U - shift;
        }
        idx--;
    }

    /* Clear the digits that were not reached. */
    word++;
    while (word < size) {
        r[word] = 0;
        word++;
    }
}

/* Generate a random scalar in the range 1..order-1.
 *
 * Draws 32 random bytes at a time and retries until the candidate is no
 * larger than p256_order2; the accepted candidate is then incremented by one.
 *
 * rng  Random number generator.
 * k    Scalar value.
 * returns RNG failures, MEMORY_E when memory allocation fails and
 * MP_OKAY on success.
 */
static int sp_256_ecc_gen_k_9(WC_RNG* rng, sp_digit* k)
{
    byte buf[32];
    int err = 0;

    while (err == 0) {
        err = wc_RNG_GenerateBlock(rng, buf, sizeof(buf));
        if (err != 0) {
            /* RNG failure is reported to the caller. */
            break;
        }

        sp_256_from_bin(k, 9, buf, (int)sizeof(buf));
        if (sp_256_cmp_9(k, p256_order2) > 0) {
            /* Candidate out of range - draw again. */
            continue;
        }

        sp_256_add_one_9(k);
        break;
    }

    return err;
}

/* Makes a random EC key pair.
 *
 * A random scalar is generated and multiplied by the base point (affine
 * result, constant-time flag set). The scalar is returned in priv and the
 * point in pub. The local copy of the private scalar is zeroized before its
 * storage is released.
 *
 * rng   Random number generator.
 * priv  Generated private value.
 * pub   Generated public point.
 * heap  Heap to use for allocation.
 * returns ECC_INF_E when the point does not have the correct order, RNG
 * failures, MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_make_key_256(WC_RNG* rng, mp_int* priv, ecc_point* pub, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* point = NULL;
    sp_digit* k = NULL;
#else
    #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    sp_point_256 point[2];
    #else
    sp_point_256 point[1];
    #endif
    sp_digit k[9];
#endif
#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    sp_point_256* infinity = NULL;
#endif
    int err = MP_OKAY;


    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap, DYNAMIC_TYPE_ECC);
    #else
    point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap, DYNAMIC_TYPE_ECC);
    #endif
    if (point == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
                               DYNAMIC_TYPE_ECC);
        if (k == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
    #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
        /* Second point slot receives order * public point. */
        infinity = point + 1;
    #endif

        err = sp_256_ecc_gen_k_9(rng, k);
    }
    if (err == MP_OKAY) {
        err = sp_256_ecc_mulmod_base_9(point, k, 1, 1, NULL);
    }

#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    if (err == MP_OKAY) {
        err = sp_256_ecc_mulmod_9(infinity, point, p256_order, 1, 1, NULL);
    }
    if (err == MP_OKAY) {
        /* NOTE(review): this inspects the public point, not the computed
         * 'infinity' value - confirm that is the intended validation. */
        if (sp_256_iszero_9(point->x) || sp_256_iszero_9(point->y)) {
            err = ECC_INF_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        err = sp_256_to_mp(k, priv);
    }
    if (err == MP_OKAY) {
        err = sp_256_point_to_ecc_point_9(point, pub);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (k != NULL)
#endif
    {
        /* k is the private scalar - wipe it before the storage is reused. */
        ForceZero(k, sizeof(sp_digit) * 9);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(k, heap, DYNAMIC_TYPE_ECC);
    #endif
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (point != NULL) {
        /* point is not sensitive, so no need to zeroize */
        XFREE(point, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Working state for non-blocking P256 key generation.
 *
 * state       Step of the state machine to run on the next call.
 * mulmod_ctx  State of the scalar multiplication in progress.
 * k           Generated private scalar.
 * point       Public point; a second slot is present when the generated key
 *             is validated (WOLFSSL_VALIDATE_ECC_KEYGEN).
 */
typedef struct sp_ecc_key_gen_256_ctx {
    int state;
    sp_256_ecc_mulmod_9_ctx mulmod_ctx;
    sp_digit k[9];
#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    sp_point_256  point[2];
#else
    sp_point_256 point[1];
#endif /* WOLFSSL_VALIDATE_ECC_KEYGEN */
} sp_ecc_key_gen_256_ctx;

/* Makes a random EC key pair without blocking.
 *
 * Each call performs one step and returns FP_WOULDBLOCK until the key pair is
 * complete; call again with the same context until another value is
 * returned. The working state is cleared whenever the return value is not
 * FP_WOULDBLOCK.
 *
 * sp_ctx  Context holding the working state between calls.
 * rng     Random number generator.
 * priv    Generated private value.
 * pub     Generated public point.
 * heap    Heap to use for allocation.
 * returns FP_WOULDBLOCK while more calls are needed, ECC_INF_E when the
 * validation check fails, errors from the called operations and MP_OKAY on
 * success.
 */
int sp_ecc_make_key_256_nb(sp_ecc_ctx_t* sp_ctx, WC_RNG* rng, mp_int* priv,
    ecc_point* pub, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_ecc_key_gen_256_ctx* ctx = (sp_ecc_key_gen_256_ctx*)sp_ctx->data;
#ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
    sp_point_256* infinity = ctx->point + 1;
#endif /* WOLFSSL_VALIDATE_ECC_KEYGEN */

    /* Compile-time check: negative array size unless the working state is
     * smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_ecc_key_gen_256_ctx)
                               >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
        case 0:
            /* Generate the private scalar. */
            err = sp_256_ecc_gen_k_9(rng, ctx->k);
            if (err == MP_OKAY) {
                err = FP_WOULDBLOCK;
                ctx->state = 1;
            }
            break;
        case 1:
            /* Public point = k * G, one slice of work per call. */
            err = sp_256_ecc_mulmod_base_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx,
                      ctx->point, ctx->k, 1, 1, heap);
            if (err == MP_OKAY) {
                err = FP_WOULDBLOCK;
            #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
                /* Reset the multiplication state for the validation step. */
                XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx));
                ctx->state = 2;
            #else
                ctx->state = 3;
            #endif
            }
            break;
    #ifdef WOLFSSL_VALIDATE_ECC_KEYGEN
        case 2:
            /* heap is passed to match the other caller of
             * sp_256_ecc_mulmod_9_nb in this file. */
            err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx,
                      infinity, ctx->point, p256_order, 1, 1, heap);
            if (err == MP_OKAY) {
                /* NOTE(review): this inspects the public point, not the
                 * computed 'infinity' value - confirm that is intended. */
                if (sp_256_iszero_9(ctx->point->x) ||
                    sp_256_iszero_9(ctx->point->y)) {
                    err = ECC_INF_E;
                }
                else {
                    err = FP_WOULDBLOCK;
                    ctx->state = 3;
                }
            }
            break;
    #endif /* WOLFSSL_VALIDATE_ECC_KEYGEN */
        case 3:
            /* Export the private scalar and public point. */
            err = sp_256_to_mp(ctx->k, priv);
            if (err == MP_OKAY) {
                err = sp_256_point_to_ecc_point_9(ctx->point, pub);
            }
            break;
    }

    /* Finished (success or failure): wipe the working state. */
    if (err != FP_WOULDBLOCK) {
        XMEMSET(ctx, 0, sizeof(sp_ecc_key_gen_256_ctx));
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */

#ifdef HAVE_ECC_DHE
/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 32
 *
 * The digits of r are first normalized in place (bits above bit 28 of each of
 * the low 8 digits are carried into the next digit), so r is modified.
 *
 * r  A single precision integer; normalized in place.
 * a  Byte array.
 */
static void sp_256_to_bin_9(sp_digit* r, byte* a)
{
    int i;
    int j;
    int s = 0;
    int b;

    /* Carry propagation so each of the low 8 digits holds only 29 bits. */
    for (i=0; i<8; i++) {
        r[i+1] += r[i] >> 29;
        r[i] &= 0x1fffffff;
    }
    /* Start at the last output byte: 263 / 8 - 1 = 31. */
    j = 263 / 8 - 1;
    a[j] = 0;
    /* Digits are consumed least significant first; a[] is filled from its
     * end towards its start. */
    for (i=0; i<9 && j>=0; i++) {
        b = 0;
        /* lint allow cast of mismatch sp_digit and int */
        a[j--] |= (byte)(r[i] << s); /*lint !e9033*/
        /* b counts the bits of this digit that have been written so far. */
        b += 8 - s;
        if (j < 0) {
            break;
        }
        while (b < 29) {
            a[j--] = (byte)(r[i] >> b);
            b += 8;
            if (j < 0) {
                break;
            }
        }
        /* s is the number of bits of the last byte written that this digit
         * filled; the next digit is OR-ed in above them. */
        s = 8 - (b - 29);
        if (j >= 0) {
            a[j] = 0;
        }
        /* Step back to the partially filled byte for the next digit. */
        if (s != 0) {
            j++;
        }
    }
}

/* Multiply the point by the scalar and serialize the X ordinate.
 * The number is 0 padded to maximum size on output.
 *
 * The local copies of the private scalar and of the resulting point (whose
 * X ordinate is the output) are zeroized before their storage is released.
 *
 * priv    Scalar to multiply the point by.
 * pub     Point to multiply.
 * out     Buffer to hold X ordinate.
 * outLen  On entry, size of the buffer in bytes.
 *         On exit, length of data in buffer in bytes.
 * heap    Heap to use for allocation.
 * returns BUFFER_E if the buffer is too small for output size,
 * MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_secret_gen_256(const mp_int* priv, const ecc_point* pub, byte* out,
                          word32* outLen, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* point = NULL;
    sp_digit* k = NULL;
#else
    sp_point_256 point[1];
    sp_digit k[9];
#endif
    int err = MP_OKAY;

    /* The X ordinate is always written as 32 bytes. */
    if (*outLen < 32U) {
        err = BUFFER_E;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
                                         DYNAMIC_TYPE_ECC);
        if (point == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        k = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
                               DYNAMIC_TYPE_ECC);
        if (k == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        sp_256_from_mp(k, 9, priv);
        sp_256_point_from_ecc_point_9(point, pub);
        /* Affine result (map = 1) with the constant-time flag set. */
        err = sp_256_ecc_mulmod_9(point, point, k, 1, 1, heap);
    }
    if (err == MP_OKAY) {
        sp_256_to_bin_9(point->x, out);
        *outLen = 32;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (k != NULL)
#endif
    {
        /* k is the private scalar - wipe it before the storage is reused. */
        ForceZero(k, sizeof(sp_digit) * 9);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(k, heap, DYNAMIC_TYPE_ECC);
    #endif
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (point != NULL)
#endif
    {
        /* point carries the value that was written to out - wipe it too. */
        ForceZero(point, sizeof(sp_point_256));
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(point, heap, DYNAMIC_TYPE_ECC);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Working state for the non-blocking P256 shared secret calculation.
 *
 * state       Step of the state machine to run on the next call.
 * mulmod_ctx  State of the scalar multiplication in progress.
 * k           Private scalar.
 * point       Peer's point; replaced by the multiplication result.
 */
typedef struct sp_ecc_sec_gen_256_ctx {
    int state;
    union {
        sp_256_ecc_mulmod_9_ctx mulmod_ctx;
    };
    sp_digit k[9];
    sp_point_256 point;
} sp_ecc_sec_gen_256_ctx;

/* Multiply the point by the scalar and serialize the X ordinate, without
 * blocking.
 *
 * Each call performs one step and returns FP_WOULDBLOCK until the result is
 * available. The working state is cleared whenever the return value is not
 * FP_WOULDBLOCK. The output buffer size is checked on every call and no step
 * is run when it is too small, so the 32 byte result is never written into a
 * shorter buffer.
 *
 * sp_ctx  Context holding the working state between calls.
 * priv    Scalar to multiply the point by.
 * pub     Point to multiply.
 * out     Buffer to hold X ordinate.
 * outLen  On entry, size of the buffer in bytes.
 *         On exit, length of data in buffer in bytes.
 * heap    Heap to use for allocation.
 * returns FP_WOULDBLOCK while more calls are needed, BUFFER_E if the buffer
 * is too small for output size, errors from the multiplication and MP_OKAY on
 * success.
 */
int sp_ecc_secret_gen_256_nb(sp_ecc_ctx_t* sp_ctx, const mp_int* priv,
    const ecc_point* pub, byte* out, word32* outLen, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_ecc_sec_gen_256_ctx* ctx = (sp_ecc_sec_gen_256_ctx*)sp_ctx->data;

    /* Compile-time check: negative array size unless the working state is
     * smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_ecc_sec_gen_256_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    if (*outLen < 32U) {
        err = BUFFER_E;
    }
    else {
        switch (ctx->state) {
            case 0:
                /* Load the inputs into the working state. */
                sp_256_from_mp(ctx->k, 9, priv);
                sp_256_point_from_ecc_point_9(&ctx->point, pub);
                ctx->state = 1;
                break;
            case 1:
                err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx,
                          &ctx->point, &ctx->point, ctx->k, 1, 1, heap);
                if (err == MP_OKAY) {
                    sp_256_to_bin_9(ctx->point.x, out);
                    *outLen = 32;
                }
                break;
        }
    }

    if (err == MP_OKAY && ctx->state != 1) {
        err = FP_WOULDBLOCK;
    }
    /* Finished (success or failure): wipe the working state. */
    if (err != FP_WOULDBLOCK) {
        XMEMSET(ctx, 0, sizeof(sp_ecc_sec_gen_256_ctx));
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#endif /* HAVE_ECC_DHE */

#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* Shift a right by n bits into r. (r = a >> n)
 *
 * Works on 9 digits of 29 bits; the low n bits of a[0] are discarded.
 *
 * r  A single precision integer that receives the result (9 digits).
 * a  A single precision integer to shift (9 digits).
 * n  Number of bits to shift by.
 *    NOTE(review): presumably 0 < n < 29 - confirm against callers.
 */
SP_NOINLINE static void sp_256_rshift_9(sp_digit* r, const sp_digit* a,
        byte n)
{
    int i;

#ifdef WOLFSSL_SP_SMALL
    /* Each result digit combines the upper bits of a[i] with the low n bits
     * of a[i + 1]. */
    for (i=0; i<8; i++) {
        r[i] = ((a[i] >> n) | (a[i + 1] << (29 - n))) & 0x1fffffff;
    }
#else
    /* Unrolled form: the loop body runs exactly once (i = 0). Only the bits
     * brought in from the next digit are masked here. */
    for (i=0; i<8; i += 8) {
        r[i+0] = (a[i+0] >> n) | ((a[i+1] << (29 - n)) & 0x1fffffff);
        r[i+1] = (a[i+1] >> n) | ((a[i+2] << (29 - n)) & 0x1fffffff);
        r[i+2] = (a[i+2] >> n) | ((a[i+3] << (29 - n)) & 0x1fffffff);
        r[i+3] = (a[i+3] >> n) | ((a[i+4] << (29 - n)) & 0x1fffffff);
        r[i+4] = (a[i+4] >> n) | ((a[i+5] << (29 - n)) & 0x1fffffff);
        r[i+5] = (a[i+5] >> n) | ((a[i+6] << (29 - n)) & 0x1fffffff);
        r[i+6] = (a[i+6] >> n) | ((a[i+7] << (29 - n)) & 0x1fffffff);
        r[i+7] = (a[i+7] >> n) | ((a[i+8] << (29 - n)) & 0x1fffffff);
    }
#endif /* WOLFSSL_SP_SMALL */
    /* Top digit has no higher digit to take bits from. */
    r[8] = a[8] >> n;
}

/* Multiply a 9 digit number by a single digit. (r = a * b)
 *
 * The result occupies 10 digits: r[0..8] hold 29-bit pieces of the product
 * and r[9] receives whatever is left over from the top.
 *
 * r  A single precision integer that receives the product (10 digits).
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_256_mul_d_9(sp_digit* r, const sp_digit* a,
    sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    const sp_int64 scalar = b;
    sp_int64 acc = 0;
    int idx = 0;

    /* Running accumulator: emit the low 29 bits, carry the rest forward. */
    while (idx < 9) {
        acc += scalar * a[idx];
        r[idx] = (sp_digit)(acc & 0x1fffffff);
        acc >>= 29;
        idx++;
    }
    r[9] = (sp_digit)acc;
#else
    const sp_int64 scalar = b;
    sp_int64 prod[9];

    /* All partial products are formed before any digit of r is written. */
    prod[0] = scalar * a[0];
    prod[1] = scalar * a[1];
    prod[2] = scalar * a[2];
    prod[3] = scalar * a[3];
    prod[4] = scalar * a[4];
    prod[5] = scalar * a[5];
    prod[6] = scalar * a[6];
    prod[7] = scalar * a[7];
    prod[8] = scalar * a[8];

    /* Digit i = low 29 bits of product i plus the overflow of product i-1. */
    r[0] = (sp_digit)(prod[0] & 0x1fffffff);
    r[1] = (sp_digit)((prod[0] >> 29) + (prod[1] & 0x1fffffff));
    r[2] = (sp_digit)((prod[1] >> 29) + (prod[2] & 0x1fffffff));
    r[3] = (sp_digit)((prod[2] >> 29) + (prod[3] & 0x1fffffff));
    r[4] = (sp_digit)((prod[3] >> 29) + (prod[4] & 0x1fffffff));
    r[5] = (sp_digit)((prod[4] >> 29) + (prod[5] & 0x1fffffff));
    r[6] = (sp_digit)((prod[5] >> 29) + (prod[6] & 0x1fffffff));
    r[7] = (sp_digit)((prod[6] >> 29) + (prod[7] & 0x1fffffff));
    r[8] = (sp_digit)((prod[7] >> 29) + (prod[8] & 0x1fffffff));
    r[9] = (sp_digit)(prod[8] >> 29);
#endif /* WOLFSSL_SP_SMALL */
}

/* Shift a left by n bits into r. (r = a << n)
 *
 * Reads 18 digits of 29 bits from a and writes 19 digits to r; r[18]
 * receives the bits shifted out of the top of a[17].
 * Digits are produced from the most significant downwards, so each a[i] is
 * read before r[i] is written.
 * NOTE(review): that ordering appears to permit r == a - confirm.
 *
 * r  A single precision integer that receives the result (19 digits).
 * a  A single precision integer to shift (18 digits).
 * n  Number of bits to shift by.
 *    NOTE(review): presumably 0 < n < 29 - confirm against callers.
 */
SP_NOINLINE static void sp_256_lshift_18(sp_digit* r, const sp_digit* a,
        byte n)
{
#ifdef WOLFSSL_SP_SMALL
    int i;

    r[18] = a[17] >> (29 - n);
    /* Each result digit: a[i] shifted up, plus the top n bits of a[i-1]. */
    for (i=17; i>0; i--) {
        r[i] = ((a[i] << n) | (a[i-1] >> (29 - n))) & 0x1fffffff;
    }
#else
    /* Unrolled form; digits are cast to sp_int_digit before shifting. */
    sp_int_digit s;
    sp_int_digit t;

    s = (sp_int_digit)a[17];
    r[18] = s >> (29U - n);
    s = (sp_int_digit)(a[17]); t = (sp_int_digit)(a[16]);
    r[17] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[16]); t = (sp_int_digit)(a[15]);
    r[16] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[15]); t = (sp_int_digit)(a[14]);
    r[15] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[14]); t = (sp_int_digit)(a[13]);
    r[14] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[13]); t = (sp_int_digit)(a[12]);
    r[13] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[12]); t = (sp_int_digit)(a[11]);
    r[12] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[11]); t = (sp_int_digit)(a[10]);
    r[11] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[10]); t = (sp_int_digit)(a[9]);
    r[10] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[9]); t = (sp_int_digit)(a[8]);
    r[9] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[8]); t = (sp_int_digit)(a[7]);
    r[8] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[7]); t = (sp_int_digit)(a[6]);
    r[7] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[6]); t = (sp_int_digit)(a[5]);
    r[6] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[5]); t = (sp_int_digit)(a[4]);
    r[5] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[4]); t = (sp_int_digit)(a[3]);
    r[4] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[3]); t = (sp_int_digit)(a[2]);
    r[3] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[2]); t = (sp_int_digit)(a[1]);
    r[2] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
    s = (sp_int_digit)(a[1]); t = (sp_int_digit)(a[0]);
    r[1] = ((s << n) | (t >> (29U - n))) & 0x1fffffff;
#endif /* WOLFSSL_SP_SMALL */
    /* Lowest digit has nothing below it to shift in. */
    r[0] = (a[0] << n) & 0x1fffffff;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Simplified based on top word of divisor being (1 << 29) - 1
 *
 * Both the dividend and the divisor are scaled by 2^5 before the reduction
 * and the remainder is shifted right by 5 bits at the end.
 *
 * a  Number to be divided (18 digits are read).
 * d  Number to divide with.
 * m  Multiplier result (unused).
 * r  Remainder from the division.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_256_div_9(const sp_digit* a, const sp_digit* d,
        const sp_digit* m, sp_digit* r)
{
    int i;
    sp_digit r1;
    sp_digit mask;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* t1 = NULL;
#else
    sp_digit t1[4 * 9 + 3];
#endif
    sp_digit* t2 = NULL;
    sp_digit* sd = NULL;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * (4 * 9 + 3), NULL,
                                                       DYNAMIC_TYPE_TMP_BUFFER);
    if (t1 == NULL)
        err = MEMORY_E;
#endif

    (void)m;

    if (err == MP_OKAY) {
        /* One buffer, three regions: t1 (19 digits, shifted dividend),
         * t2 (10 digits, digit * divisor), sd (10 digits, shifted divisor). */
        t2 = t1 + 18 + 1;
        sd = t2 + 9 + 1;

        /* sd = d * 2^5, t1 = a * 2^5 */
        sp_256_mul_d_9(sd, d, (sp_digit)1 << 5);
        sp_256_lshift_18(t1, a, 5);
        /* Fold t1[17] back to 29 bits, carrying into t1[18]. */
        t1[9 + 9] += t1[9 + 9 - 1] >> 29;
        t1[9 + 9 - 1] &= 0x1fffffff;
        for (i=8; i>=0; i--) {
            /* The current top digit is used directly as the multiple of sd
             * to subtract. */
            r1 = t1[9 + i];
            sp_256_mul_d_9(t2, sd, r1);
            (void)sp_256_sub_9(&t1[i], &t1[i], t2);
            t1[9 + i] -= t2[9];
            sp_256_norm_9(&t1[i + 1]);

            /* mask is all ones when the top digit is still positive, which
             * selects one more subtraction of sd.
             * NOTE(review): relies on an arithmetic right shift of a 32-bit
             * signed sp_digit - confirm. */
            mask = ~((t1[9 + i] - 1) >> 31);
            sp_256_cond_sub_9(t1 + i, t1 + i, sd, mask);
            sp_256_norm_9(&t1[i + 1]);
        }
        sp_256_norm_9(t1);
        /* Undo the scaling by 2^5. */
        sp_256_rshift_9(r, t1, 5);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t1 != NULL)
        XFREE(t1, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* Compute the remainder of a divided by m. (r = a mod m)
 *
 * Implemented with the division routine; the quotient is not requested.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MEMORY_E when unable to allocate memory and MP_OKAY otherwise.
 */
static int sp_256_mod_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err;

    err = sp_256_div_9(a, m, NULL, r);

    return err;
}

#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* Multiply two numbers mod the order of P256 curve. (r = a * b mod order)
 *
 * The full product is written to r and then Montgomery reduced in place
 * against p256_order.
 *
 * r  Result of the multiplication.
 * a  First operand of the multiplication.
 * b  Second operand of the multiplication.
 */
static void sp_256_mont_mul_order_9(sp_digit* r, const sp_digit* a, const sp_digit* b)
{
    sp_256_mul_9(r, a, b);
    sp_256_mont_reduce_order_9(r, p256_order, p256_mp_order);
}

#if defined(HAVE_ECC_SIGN) || (defined(HAVE_ECC_VERIFY) && defined(WOLFSSL_SP_SMALL))
#ifdef WOLFSSL_SP_SMALL
/* Order-2 for the P256 curve.
 * Stored as 32-bit words, least significant word first; used as the exponent
 * when inverting modulo the order. */
static const uint32_t p256_order_minus_2[8] = {
    0xfc63254fU,0xf3b9cac2U,0xa7179e84U,0xbce6faadU,0xffffffffU,0xffffffffU,
    0x00000000U,0xffffffffU
};
#else
/* The low half of the order-2 of the P256 curve.
 * Stored as 32-bit words, least significant word first. */
static const sp_int_digit p256_order_low[4] = {
    0xfc63254fU,0xf3b9cac2U,0xa7179e84U,0xbce6faadU
};
#endif /* WOLFSSL_SP_SMALL */

/* Square number mod the order of P256 curve. (r = a * a mod order)
 *
 * The full square is written to r and then Montgomery reduced in place
 * against p256_order.
 *
 * r  Result of the squaring.
 * a  Number to square.
 */
static void sp_256_mont_sqr_order_9(sp_digit* r, const sp_digit* a)
{
    sp_256_sqr_9(r, a);
    sp_256_mont_reduce_order_9(r, p256_order, p256_mp_order);
}

#ifndef WOLFSSL_SP_SMALL
/* Repeatedly square a number mod the order of the P256 curve.
 * (r = a ^ (2 ^ n) mod order)
 *
 * At least one squaring is always performed.
 *
 * r  Result of the squarings.
 * a  Number to square.
 * n  Number of times to square.
 */
static void sp_256_mont_sqr_n_order_9(sp_digit* r, const sp_digit* a, int n)
{
    int left;

    /* First squaring reads a; the remaining ones work on r in place. */
    sp_256_mont_sqr_order_9(r, a);
    for (left = n; left > 1; left--) {
        sp_256_mont_sqr_order_9(r, r);
    }
}
#endif /* !WOLFSSL_SP_SMALL */

/* Invert the number, in Montgomery form, modulo the order of the P256 curve.
 * (r = 1 / a mod order)
 *
 * r   Inverse result.
 * a   Number to invert.
 * td  Temporary data.
 */

#ifdef WOLFSSL_SP_NONBLOCK
/* Working state for the non-blocking inversion modulo the order.
 *
 * state  Step of the state machine to run on the next call.
 * i      Index of the exponent bit being processed.
 */
typedef struct sp_256_mont_inv_order_9_ctx {
    int state;
    int i;
} sp_256_mont_inv_order_9_ctx;

/* Invert the number, in Montgomery form, modulo the order of the P256 curve
 * without blocking. (r = 1 / a mod order)
 *
 * Square-and-multiply over the bits of order-2, from bit 254 down to and
 * including bit 0, doing one squaring or one conditional multiplication per
 * call - the same bit range as the blocking small-code implementation.
 *
 * sp_ctx  Context holding the working state between calls.
 * r       Inverse result.
 * a       Number to invert.
 * t       Temporary data.
 * returns FP_WOULDBLOCK while more calls are needed and MP_OKAY when the
 * result has been written to r.
 */
static int sp_256_mont_inv_order_9_nb(sp_ecc_ctx_t* sp_ctx, sp_digit* r, const sp_digit* a,
        sp_digit* t)
{
    int err = FP_WOULDBLOCK;
    sp_256_mont_inv_order_9_ctx* ctx = (sp_256_mont_inv_order_9_ctx*)sp_ctx;

    /* Compile-time check: negative array size unless the working state is
     * smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_256_mont_inv_order_9_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
    case 0:
        /* t = a accounts for the top bit (255) of the exponent. */
        XMEMCPY(t, a, sizeof(sp_digit) * 9);
        ctx->i = 254;
        ctx->state = 1;
        break;
    case 1:
        sp_256_mont_sqr_order_9(t, t);
        ctx->state = 2;
        break;
    case 2:
        if ((p256_order_minus_2[ctx->i / 32] & ((sp_int_digit)1 << (ctx->i % 32))) != 0) {
            sp_256_mont_mul_order_9(t, t, a);
        }
        ctx->i--;
        /* Finish only after bit 0 has been processed; i is not used as an
         * index again once it is negative. */
        ctx->state = (ctx->i < 0) ? 3 : 1;
        break;
    case 3:
        XMEMCPY(r, t, sizeof(sp_digit) * 9U);
        err = MP_OKAY;
        break;
    }
    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */

/* Invert the number, in Montgomery form, modulo the order of the P256 curve.
 * (r = 1 / a mod order)
 *
 * Computed as a ^ (order - 2). The small-code build uses bit-by-bit
 * square-and-multiply over p256_order_minus_2; otherwise a fixed chain builds
 * the upper part of the exponent from runs of set bits and takes the lower
 * bits from p256_order_low, with a^f (t3) supplying selected nibbles.
 *
 * r   Inverse result.
 * a   Number to invert.
 * td  Temporary data. The non-small build uses three regions starting at
 *     td, td + 2 * 9 and td + 4 * 9.
 */
static void sp_256_mont_inv_order_9(sp_digit* r, const sp_digit* a,
        sp_digit* td)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* t = td;
    int i;

    /* t = a accounts for the top bit (255) of the exponent. */
    XMEMCPY(t, a, sizeof(sp_digit) * 9);
    for (i=254; i>=0; i--) {
        sp_256_mont_sqr_order_9(t, t);
        if ((p256_order_minus_2[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) {
            sp_256_mont_mul_order_9(t, t, a);
        }
    }
    XMEMCPY(r, t, sizeof(sp_digit) * 9U);
#else
    sp_digit* t = td;
    sp_digit* t2 = td + 2 * 9;
    sp_digit* t3 = td + 4 * 9;
    int i;

    /* t = a^2 */
    sp_256_mont_sqr_order_9(t, a);
    /* t = a^3 = t * a */
    sp_256_mont_mul_order_9(t, t, a);
    /* t2= a^c = t ^ 2 ^ 2 */
    sp_256_mont_sqr_n_order_9(t2, t, 2);
    /* t3= a^f = t2 * t */
    sp_256_mont_mul_order_9(t3, t2, t);
    /* t2= a^f0 = t3 ^ 2 ^ 4 */
    sp_256_mont_sqr_n_order_9(t2, t3, 4);
    /* t = a^ff = t2 * t3 */
    sp_256_mont_mul_order_9(t, t2, t3);
    /* t2= a^ff00 = t ^ 2 ^ 8 */
    sp_256_mont_sqr_n_order_9(t2, t, 8);
    /* t = a^ffff = t2 * t */
    sp_256_mont_mul_order_9(t, t2, t);
    /* t2= a^ffff0000 = t ^ 2 ^ 16 */
    sp_256_mont_sqr_n_order_9(t2, t, 16);
    /* t = a^ffffffff = t2 * t */
    sp_256_mont_mul_order_9(t, t2, t);
    /* t2= a^ffffffff0000000000000000 = t ^ 2 ^ 64  */
    sp_256_mont_sqr_n_order_9(t2, t, 64);
    /* t2= a^ffffffff00000000ffffffff = t2 * t */
    sp_256_mont_mul_order_9(t2, t2, t);
    /* t2= a^ffffffff00000000ffffffff00000000 = t2 ^ 2 ^ 32  */
    sp_256_mont_sqr_n_order_9(t2, t2, 32);
    /* t2= a^ffffffff00000000ffffffffffffffff = t2 * t */
    sp_256_mont_mul_order_9(t2, t2, t);
    /* t2= a^ffffffff00000000ffffffffffffffffbce6 */
    sp_256_mont_sqr_order_9(t2, t2);
    sp_256_mont_mul_order_9(t2, t2, a);
    sp_256_mont_sqr_n_order_9(t2, t2, 5);
    sp_256_mont_mul_order_9(t2, t2, t3);
    for (i=121; i>=112; i--) {
        sp_256_mont_sqr_order_9(t2, t2);
        if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) {
            sp_256_mont_mul_order_9(t2, t2, a);
        }
    }
    /* t2= a^ffffffff00000000ffffffffffffffffbce6f */
    sp_256_mont_sqr_n_order_9(t2, t2, 4);
    sp_256_mont_mul_order_9(t2, t2, t3);
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84 */
    for (i=107; i>=64; i--) {
        sp_256_mont_sqr_order_9(t2, t2);
        if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) {
            sp_256_mont_mul_order_9(t2, t2, a);
        }
    }
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f */
    sp_256_mont_sqr_n_order_9(t2, t2, 4);
    sp_256_mont_mul_order_9(t2, t2, t3);
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2 */
    for (i=59; i>=32; i--) {
        sp_256_mont_sqr_order_9(t2, t2);
        if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) {
            sp_256_mont_mul_order_9(t2, t2, a);
        }
    }
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2f */
    sp_256_mont_sqr_n_order_9(t2, t2, 4);
    sp_256_mont_mul_order_9(t2, t2, t3);
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254 */
    for (i=27; i>=0; i--) {
        sp_256_mont_sqr_order_9(t2, t2);
        if ((p256_order_low[i / 32] & ((sp_int_digit)1 << (i % 32))) != 0) {
            sp_256_mont_mul_order_9(t2, t2, a);
        }
    }
    /* t2= a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632540 */
    sp_256_mont_sqr_n_order_9(t2, t2, 4);
    /* r = a^ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63254f */
    sp_256_mont_mul_order_9(r, t2, t3);
#endif /* WOLFSSL_SP_SMALL */
}

#endif /* HAVE_ECC_SIGN || (HAVE_ECC_VERIFY && WOLFSSL_SP_SMALL) */
#endif /* HAVE_ECC_SIGN | HAVE_ECC_VERIFY */
#ifdef HAVE_ECC_SIGN
#ifndef SP_ECC_MAX_SIG_GEN
#define SP_ECC_MAX_SIG_GEN  64
#endif

/* Calculate second signature value S from R, k and private value.
 *
 * s = (r * x + e) / k
 *
 * Both k and x are overwritten: k is replaced by its inverse (kInv aliases k)
 * and x by r * x mod order.
 *
 * s    Signature value.
 * r    First signature value.
 * k    Ephemeral private key. Overwritten.
 * x    Private key as a number. Overwritten.
 * e    Hash of message as a number.
 * tmp  Temporary storage for intermediate numbers.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_calc_s_9(sp_digit* s, const sp_digit* r, sp_digit* k,
    sp_digit* x, const sp_digit* e, sp_digit* tmp)
{
    int err;
    sp_digit carry;
    sp_int32 c;
    /* The inverse is written over k. */
    sp_digit* kInv = k;

    /* Conv k to Montgomery form (mod order) */
        sp_256_mul_9(k, k, p256_norm_order);
    err = sp_256_mod_9(k, k, p256_order);
    if (err == MP_OKAY) {
        sp_256_norm_9(k);

        /* kInv = 1/k mod order */
            sp_256_mont_inv_order_9(kInv, k, tmp);
        sp_256_norm_9(kInv);

        /* s = r * x + e */
            sp_256_mul_9(x, x, r);
        err = sp_256_mod_9(x, x, p256_order);
    }
    if (err == MP_OKAY) {
        sp_256_norm_9(x);
        carry = sp_256_add_9(s, e, x);
        /* Subtract the order when the addition carried out... */
        sp_256_cond_sub_9(s, s, p256_order, 0 - carry);
        sp_256_norm_9(s);
        /* ...and once more when the sum is still not below the order; the
         * mask is all ones when c >= 0. */
        c = sp_256_cmp_9(s, p256_order);
        sp_256_cond_sub_9(s, s, p256_order,
            (sp_digit)0 - (sp_digit)(c >= 0));
        sp_256_norm_9(s);

        /* s = s * k^-1 mod order */
            sp_256_mont_mul_order_9(s, s, kInv);
        sp_256_norm_9(s);
    }

    return err;
}

/* Sign the hash using the private key.
 *   e = [hash, 256 bits] from binary
 *   r = (k.G)->x mod order
 *   s = (r * x + e) / k mod order
 * The hash is truncated to the first 256 bits.
 *
 * Up to SP_ECC_MAX_SIG_GEN attempts are made to obtain non-zero r and s.
 * All working numbers and the working point are zeroized before returning.
 *
 * hash     Hash to sign.
 * hashLen  Length of the hash data.
 * rng      Random number generator.
 * priv     Private part of key - scalar.
 * rm       First part of result as an mp_int.
 * sm       Second part of result as an mp_int.
 * km       Optional ephemeral scalar. When not NULL and not zero it is used
 *          as k and then set to zero; otherwise k is generated from rng.
 * heap     Heap to use for allocation.
 * returns RNG failures, RNG_FAILURE_E when the attempt counter reaches zero,
 * MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_sign_256(const byte* hash, word32 hashLen, WC_RNG* rng,
    const mp_int* priv, mp_int* rm, mp_int* sm, mp_int* km, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* e = NULL;
    sp_point_256* point = NULL;
#else
    sp_digit e[7 * 2 * 9];
    sp_point_256 point[1];
#endif
    sp_digit* x = NULL;
    sp_digit* k = NULL;
    sp_digit* r = NULL;
    sp_digit* tmp = NULL;
    sp_digit* s = NULL;
    sp_int32 c;
    int err = MP_OKAY;
    int i;

    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        point = (sp_point_256*)XMALLOC(sizeof(sp_point_256), heap,
                                             DYNAMIC_TYPE_ECC);
        if (point == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        e = (sp_digit*)XMALLOC(sizeof(sp_digit) * 7 * 2 * 9, heap,
                               DYNAMIC_TYPE_ECC);
        if (e == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Carve the single buffer into regions of 2 * 9 digits: e (shared
         * with s), x, k and r, followed by tmp. */
        x = e + 2 * 9;
        k = e + 4 * 9;
        r = e + 6 * 9;
        tmp = e + 8 * 9;
        s = e;

        if (hashLen > 32U) {
            hashLen = 32U;
        }
    }

    for (i = SP_ECC_MAX_SIG_GEN; err == MP_OKAY && i > 0; i--) {
        /* New random point. */
        if (km == NULL || mp_iszero(km)) {
            err = sp_256_ecc_gen_k_9(rng, k);
        }
        else {
            /* Caller supplied k: use it once and clear it. */
            sp_256_from_mp(k, 9, km);
            mp_zero(km);
        }
        if (err == MP_OKAY) {
                err = sp_256_ecc_mulmod_base_9(point, k, 1, 1, heap);
        }

        if (err == MP_OKAY) {
            /* r = point->x mod order */
            XMEMCPY(r, point->x, sizeof(sp_digit) * 9U);
            sp_256_norm_9(r);
            c = sp_256_cmp_9(r, p256_order);
            sp_256_cond_sub_9(r, r, p256_order,
                (sp_digit)0 - (sp_digit)(c >= 0));
            sp_256_norm_9(r);

            if (!sp_256_iszero_9(r)) {
                /* x is modified in calculation of s. */
                sp_256_from_mp(x, 9, priv);
                /* s ptr == e ptr, e is modified in calculation of s. */
                sp_256_from_bin(e, 9, hash, (int)hashLen);

                err = sp_256_calc_s_9(s, r, k, x, e, tmp);

                /* Check that signature is usable. */
                if ((err == MP_OKAY) && (!sp_256_iszero_9(s))) {
                    break;
                }
            }
        }
#ifdef WOLFSSL_ECDSA_SET_K_ONE_LOOP
        /* Force the counter so that the loop ends after this attempt. */
        i = 1;
#endif
    }

    /* Counter exhausted without leaving the loop through the break. */
    if (i == 0) {
        err = RNG_FAILURE_E;
    }

    if (err == MP_OKAY) {
        err = sp_256_to_mp(r, rm);
    }
    if (err == MP_OKAY) {
        err = sp_256_to_mp(s, sm);
    }

    /* Wipe the working numbers (they include k and the private key). */
#ifdef WOLFSSL_SP_SMALL_STACK
    if (e != NULL)
#endif
    {
        ForceZero(e, sizeof(sp_digit) * 7 * 2 * 9);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(e, heap, DYNAMIC_TYPE_ECC);
    #endif
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (point != NULL)
#endif
    {
        ForceZero(point, sizeof(sp_point_256));
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(point, heap, DYNAMIC_TYPE_ECC);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of the non-blocking P-256 sign operation,
 * sp_ecc_sign_256_nb(). That function accesses it through the data area of
 * the caller supplied sp_ecc_ctx_t.
 */
typedef struct sp_ecc_sign_256_ctx {
    /* Current step of the signing state machine (0 = INIT ... 10 = RES). */
    int state;
    /* Contexts of the nested non-blocking operations sharing storage. */
    union {
        sp_256_ecc_mulmod_9_ctx mulmod_ctx;
        sp_256_mont_inv_order_9_ctx mont_inv_order_ctx;
    };
    /* Hash loaded as a number. s points here, so it is overwritten by s. */
    sp_digit e[2*9];
    /* Private key loaded as a number; later holds r * x. */
    sp_digit x[2*9];
    /* Per-signature scalar k. kInv points here, so it later holds 1/k. */
    sp_digit k[2*9];
    /* First half of the signature: x ordinate of k.G reduced by the order. */
    sp_digit r[2*9];
    /* Scratch space handed to the modular inversion. */
    sp_digit tmp[3 * 2*9];
    /* Result of the scalar multiplication of the base point by k. */
    sp_point_256 point;
    /* Alias of e - set in the INIT state. */
    sp_digit* s;
    /* Alias of k - set in the INIT state. */
    sp_digit* kInv;
    /* Remaining attempts; starts at SP_ECC_MAX_SIG_GEN. */
    int i;
} sp_ecc_sign_256_ctx;

/* Non-blocking ECDSA sign of a hash with the P-256 curve.
 * Each call executes one case of the state machine below and returns. The
 * state is kept in sp_ctx between calls.
 *   e = Truncate(hash, 256)
 *   r = (k.G)->x mod order
 *   s = (r * x + e) / k mod order
 *
 * sp_ctx   Context holding the state of the operation between calls.
 *          NOTE(review): state 0 is the INIT step, so the context presumably
 *          has to be zeroed before the first call - confirm with callers.
 * hash     Hash to sign.
 * hashLen  Length of the hash data in bytes. At most 32 bytes are used.
 * rng      Random number generator used when no k is supplied.
 * priv     Private part of key - scalar.
 * rm       First part of result as an mp_int.
 * sm       Second part of result as an mp_int.
 * km       Optional scalar to use as k. When not NULL and not zero it is
 *          used for the first attempt and then set to zero.
 * heap     Heap to use for allocation.
 * returns FP_WOULDBLOCK while more calls are needed, MP_OKAY when rm and sm
 * have been set, RNG_FAILURE_E when no usable signature was generated within
 * SP_ECC_MAX_SIG_GEN attempts, and otherwise the error of the failing step.
 */
int sp_ecc_sign_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash, word32 hashLen, WC_RNG* rng,
    mp_int* priv, mp_int* rm, mp_int* sm, mp_int* km, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_ecc_sign_256_ctx* ctx = (sp_ecc_sign_256_ctx*)sp_ctx->data;

    /* Compile time check: the array size is negative, and compilation fails,
     * when the signing context is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_ecc_sign_256_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
    case 0: /* INIT */
        /* s shares storage with e and kInv shares storage with k. */
        ctx->s = ctx->e;
        ctx->kInv = ctx->k;

        ctx->i = SP_ECC_MAX_SIG_GEN;
        ctx->state = 1;
        break;
    case 1: /* GEN */
        /* New random point. */
        if (km == NULL || mp_iszero(km)) {
            err = sp_256_ecc_gen_k_9(rng, ctx->k);
        }
        else {
            /* Use the supplied k once only - a retry generates a new k. */
            sp_256_from_mp(ctx->k, 9, km);
            mp_zero(km);
        }
        /* Reset nested context ready for the scalar multiplication. */
        XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx));
        ctx->state = 2;
        break;
    case 2: /* MULMOD */
        /* point = k.G - stays in this state until the multiply completes. */
        err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx,
            &ctx->point, &p256_base, ctx->k, 1, 1, heap);
        if (err == MP_OKAY) {
            ctx->state = 3;
        }
        break;
    case 3: /* MODORDER */
    {
        sp_int32 c;
        /* r = point->x mod order */
        XMEMCPY(ctx->r, ctx->point.x, sizeof(sp_digit) * 9U);
        sp_256_norm_9(ctx->r);
        c = sp_256_cmp_9(ctx->r, p256_order);
        /* Mask is all ones when r >= order so that order is subtracted. */
        sp_256_cond_sub_9(ctx->r, ctx->r, p256_order,
            (sp_digit)0 - (sp_digit)(c >= 0));
        sp_256_norm_9(ctx->r);

        /* NOTE(review): the blocking signing loop earlier in this file only
         * calculates s when r is not zero; there is no such check here -
         * confirm this is intended. */
        if (hashLen > 32U) {
            hashLen = 32U;
        }
        /* Load private key and hash - both are modified calculating s. */
        sp_256_from_mp(ctx->x, 9, priv);
        sp_256_from_bin(ctx->e, 9, hash, (int)hashLen);
        ctx->state = 4;
        break;
    }
    case 4: /* KMODORDER */
        /* Conv k to Montgomery form (mod order) */
        sp_256_mul_9(ctx->k, ctx->k, p256_norm_order);
        err = sp_256_mod_9(ctx->k, ctx->k, p256_order);
        if (err == MP_OKAY) {
            sp_256_norm_9(ctx->k);
            /* Reset nested context ready for the inversion. */
            XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx));
            ctx->state = 5;
        }
        break;
    case 5: /* KINV */
        /* kInv = 1/k mod order */
        err = sp_256_mont_inv_order_9_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->kInv, ctx->k, ctx->tmp);
        if (err == MP_OKAY) {
            XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx));
            ctx->state = 6;
        }
        break;
    case 6: /* KINVNORM */
        sp_256_norm_9(ctx->kInv);
        ctx->state = 7;
        break;
    case 7: /* R */
        /* s = r * x + e */
        /* x = r * x - reduced in the next state. */
        sp_256_mul_9(ctx->x, ctx->x, ctx->r);
        ctx->state = 8;
        break;
    case 8: /* S1 */
        /* x = r * x mod order */
        err = sp_256_mod_9(ctx->x, ctx->x, p256_order);
        if (err == MP_OKAY)
            ctx->state = 9;
        break;
    case 9: /* S2 */
    {
        sp_digit carry;
        sp_int32 c;
        sp_256_norm_9(ctx->x);
        /* s = e + r * x, then reduce: subtract order on carry out and again
         * when the result is still not less than order. */
        carry = sp_256_add_9(ctx->s, ctx->e, ctx->x);
        sp_256_cond_sub_9(ctx->s, ctx->s,
            p256_order, 0 - carry);
        sp_256_norm_9(ctx->s);
        c = sp_256_cmp_9(ctx->s, p256_order);
        sp_256_cond_sub_9(ctx->s, ctx->s, p256_order,
            (sp_digit)0 - (sp_digit)(c >= 0));
        sp_256_norm_9(ctx->s);

        /* s = s * k^-1 mod order */
        sp_256_mont_mul_order_9(ctx->s, ctx->s, ctx->kInv);
        sp_256_norm_9(ctx->s);

        /* Check that signature is usable. */
        if (sp_256_iszero_9(ctx->s) == 0) {
            ctx->state = 10;
            break;
        }
    #ifdef WOLFSSL_ECDSA_SET_K_ONE_LOOP
        /* Only one attempt allowed - the decrement below makes i zero. */
        ctx->i = 1;
    #endif

        /* not usable gen, try again */
        ctx->i--;
        if (ctx->i == 0) {
            err = RNG_FAILURE_E;
        }
        ctx->state = 1;
        break;
    }
    case 10: /* RES */
        /* Hand back r and s to the caller. */
        err = sp_256_to_mp(ctx->r, rm);
        if (err == MP_OKAY) {
            err = sp_256_to_mp(ctx->s, sm);
        }
        break;
    }

    /* A step succeeding is only completion when it was the final step. */
    if (err == MP_OKAY && ctx->state != 10) {
        err = FP_WOULDBLOCK;
    }
    /* Finished, with success or failure: clear the working values, which
     * include the private key and k. */
    if (err != FP_WOULDBLOCK) {
        XMEMSET(ctx->e, 0, sizeof(sp_digit) * 2U * 9U);
        XMEMSET(ctx->x, 0, sizeof(sp_digit) * 2U * 9U);
        XMEMSET(ctx->k, 0, sizeof(sp_digit) * 2U * 9U);
        XMEMSET(ctx->r, 0, sizeof(sp_digit) * 2U * 9U);
        XMEMSET(ctx->tmp, 0, sizeof(sp_digit) * 3U * 2U * 9U);
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#endif /* HAVE_ECC_SIGN */

#ifndef WOLFSSL_SP_SMALL
/* Lookup table used by sp_256_num_bits_29_9() to turn the top 5 bits of the
 * product of a bit-smeared digit and 0x07C4ACDD into a count of bits.
 * Entries are the bit counts 1..32.
 */
static const char sp_256_tab32_9[32] = {
     1, 10,  2, 11, 14, 22,  3, 30,
    12, 15, 17, 19, 23, 26,  4, 31,
     9, 13, 21, 29, 16, 18, 25,  8,
    20, 28, 24,  7, 27,  6,  5, 32};

/* Count the number of bits needed to represent a digit.
 *
 * Every bit below the most significant set bit is set (smearing) and the
 * result is multiplied by a constant so that the top 5 bits of the 32-bit
 * product index the lookup table of bit counts.
 *
 * The calculation is done on an unsigned 32-bit copy of the digit: the
 * product does not fit in 32 bits and an overflowing multiplication of
 * signed values is undefined behaviour, while unsigned arithmetic is defined
 * to wrap.
 *
 * v  Digit to count the bits of. The caller visible here only passes
 *    non-zero digits; a value of zero returns 1.
 * returns the position, counting from 1, of the most significant set bit.
 */
static int sp_256_num_bits_29_9(sp_digit v)
{
    uint32_t w = (uint32_t)v;

    /* Set all bits below the most significant set bit. */
    w |= w >> 1;
    w |= w >> 2;
    w |= w >> 4;
    w |= w >> 8;
    w |= w >> 16;

    /* Product wraps modulo 2^32 - top 5 bits select the table entry. */
    return sp_256_tab32_9[(w * 0x07C4ACDDU) >> 27];
}

/* Count the number of bits in a 9 digit number with 29 bits per digit.
 *
 * a  Number to count the bits of.
 * returns the number of bits up to and including the most significant
 * non-zero digit, or 0 when all digits are zero.
 */
static int sp_256_num_bits_9(const sp_digit* a)
{
    int top = 8;

    /* Find the most significant non-zero digit. */
    while ((top >= 0) && (a[top] == 0)) {
        top--;
    }
    if (top < 0) {
        /* All digits are zero. */
        return 0;
    }

    /* Bits in the top digit plus 29 for each digit below it. */
    return sp_256_num_bits_29_9(a[top]) + (top * 29);
}

/* Non-constant time modular inversion.
 *
 * Works by repeated halving and subtraction of two values, u (starts as m)
 * and v (starts as a), until one of them is one bit long. The coefficients
 * b (starts as 0) and d (starts as 1) have the same operations applied
 * modulo m and the one paired with the finishing value is the result.
 *
 * @param  [out]  r   Resulting number.
 * @param  [in]   a   Number to invert.
 * @param  [in]   m   Modulus.
 * @return  MP_OKAY on success.
 * @return  MEMORY_E when dynamic memory allocation fails.
 */
static int sp_256_mod_inv_9(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    int err = MP_OKAY;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* u = NULL;
#else
    sp_digit u[9 * 4];
#endif
    sp_digit* v = NULL;
    sp_digit* b = NULL;
    sp_digit* d = NULL;
    /* Bit lengths of u and v - kept up to date as the values are halved. */
    int ut;
    int vt;

#ifdef WOLFSSL_SP_SMALL_STACK
    u = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9 * 4, NULL,
                                                              DYNAMIC_TYPE_ECC);
    if (u == NULL)
        err = MEMORY_E;
#endif

    if (err == MP_OKAY) {
        /* One buffer holds the four 9 digit numbers: u, v, b and d. */
        v = u + 9;
        b = u + 2 * 9;
        d = u + 3 * 9;

        XMEMCPY(u, m, sizeof(sp_digit) * 9);
        XMEMCPY(v, a, sizeof(sp_digit) * 9);

        ut = sp_256_num_bits_9(u);
        vt = sp_256_num_bits_9(v);

        XMEMSET(b, 0, sizeof(sp_digit) * 9);
        if ((v[0] & 1) == 0) {
            /* v is even: halve v and set d to the halving of 1 modulo m.
             * NOTE(review): d = (m + 1) / 2 presumably relies on m being
             * odd - confirm against callers. */
            sp_256_rshift1_9(v, v);
            XMEMCPY(d, m, sizeof(sp_digit) * 9);
            d[0]++;
            sp_256_rshift1_9(d, d);
            vt--;

            /* Keep halving while v is even - add m to d when d is odd so
             * that d can be halved. */
            while ((v[0] & 1) == 0) {
                sp_256_rshift1_9(v, v);
                if (d[0] & 1)
                    sp_256_add_9(d, d, m);
                sp_256_rshift1_9(d, d);
                vt--;
            }
        }
        else {
            /* v is odd: d = 1 */
            XMEMSET(d+1, 0, sizeof(sp_digit) * (9 - 1));
            d[0] = 1;
        }

        /* Subtract the smaller from the larger, and remove factors of two,
         * until u or v is a single bit long. */
        while (ut > 1 && vt > 1) {
            if ((ut > vt) || ((ut == vt) &&
                    (sp_256_cmp_9(u, v) >= 0))) {
                /* u >= v: u -= v and b -= d (mod m) */
                sp_256_sub_9(u, u, v);
                sp_256_norm_9(u);

                sp_256_sub_9(b, b, d);
                sp_256_norm_9(b);
                /* Negative result: add back the modulus. */
                if (b[8] < 0)
                    sp_256_add_9(b, b, m);
                sp_256_norm_9(b);
                ut = sp_256_num_bits_9(u);

                /* Halve u, and b modulo m, until u is odd. */
                do {
                    sp_256_rshift1_9(u, u);
                    if (b[0] & 1)
                        sp_256_add_9(b, b, m);
                    sp_256_rshift1_9(b, b);
                    ut--;
                }
                while (ut > 0 && (u[0] & 1) == 0);
            }
            else {
                /* v > u: v -= u and d -= b (mod m) */
                sp_256_sub_9(v, v, u);
                sp_256_norm_9(v);

                sp_256_sub_9(d, d, b);
                sp_256_norm_9(d);
                /* Negative result: add back the modulus. */
                if (d[8] < 0)
                    sp_256_add_9(d, d, m);
                sp_256_norm_9(d);
                vt = sp_256_num_bits_9(v);

                /* Halve v, and d modulo m, until v is odd. */
                do {
                    sp_256_rshift1_9(v, v);
                    if (d[0] & 1)
                        sp_256_add_9(d, d, m);
                    sp_256_rshift1_9(d, d);
                    vt--;
                }
                while (vt > 0 && (v[0] & 1) == 0);
            }
        }

        /* Result is the coefficient paired with the value that finished. */
        if (ut == 1)
            XMEMCPY(r, b, sizeof(sp_digit) * 9);
        else
            XMEMCPY(r, d, sizeof(sp_digit) * 9);
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (u != NULL)
        XFREE(u, NULL, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#endif /* WOLFSSL_SP_SMALL */

/* Add point p2 to point p1, storing the result in p1.
 * Handles p1 == p2 and a result at infinity.
 *
 * p1   First point to add and holds result.
 * p2   Second point to add.
 * tmp  Temporary storage for intermediate numbers.
 */
static void sp_256_add_points_9(sp_point_256* p1, const sp_point_256* p2,
    sp_digit* tmp)
{
    int j;

    sp_256_proj_point_add_9(p1, p1, p2, tmp);

    /* Non-zero Z ordinate: result of the addition is used as is. */
    if (!sp_256_iszero_9(p1->z)) {
        return;
    }

    if (sp_256_iszero_9(p1->x) && sp_256_iszero_9(p1->y)) {
        /* All ordinates zero: double the second point instead. */
        sp_256_proj_point_dbl_9(p1, p2, tmp);
        return;
    }

    /* Only Z is zero: set X to zero and Z to p256_norm_mod.
     * Y ordinate is not used from here - don't set. */
    for (j = 0; j < 9; j++) {
        p1->x[j] = 0;
    }
    XMEMCPY(p1->z, p256_norm_mod, sizeof(p256_norm_mod));
}

/* Calculate the verification point: [e/s]G + [r/s]Q
 *
 * The inverse of s is calculated with sp_256_mod_inv_9() (non-constant time)
 * unless WOLFSSL_SP_SMALL is defined, in which case sp_256_mont_inv_order_9()
 * is used.
 *
 * p1    Calculated point.
 * p2    Public point and temporary.
 * s     Second part of signature as a number. Modified: holds the inverse
 *       on return.
 * u1    Number multiplied by the inverse of s and then used as the scalar
 *       for the base point. The verify function passes the hash (e).
 * u2    Number multiplied by the inverse of s and then used as the scalar
 *       for p2. The verify function passes r.
 * tmp   Temporary storage for intermediate numbers.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_calc_vfy_point_9(sp_point_256* p1, sp_point_256* p2,
    sp_digit* s, sp_digit* u1, sp_digit* u2, sp_digit* tmp, void* heap)
{
    int err;

#ifndef WOLFSSL_SP_SMALL
    /* s = 1/s mod order - inverted before conversion to Montgomery form. */
    err = sp_256_mod_inv_9(s, s, p256_order);
    if (err == MP_OKAY)
#endif /* !WOLFSSL_SP_SMALL */
    {
        /* Convert s to Montgomery form (mod order). */
        sp_256_mul_9(s, s, p256_norm_order);
        err = sp_256_mod_9(s, s, p256_order);
    }
    if (err == MP_OKAY) {
        sp_256_norm_9(s);
#ifdef WOLFSSL_SP_SMALL
        {
            /* s = 1/s mod order, then u1 = u1/s and u2 = u2/s */
            sp_256_mont_inv_order_9(s, s, tmp);
            sp_256_mont_mul_order_9(u1, u1, s);
            sp_256_mont_mul_order_9(u2, u2, s);
        }
#else
        {
            /* s already inverted: u1 = u1/s and u2 = u2/s */
            sp_256_mont_mul_order_9(u1, u1, s);
            sp_256_mont_mul_order_9(u2, u2, s);
        }
#endif /* WOLFSSL_SP_SMALL */
        {
            /* p1 = [u1]G */
            err = sp_256_ecc_mulmod_base_9(p1, u1, 0, 0, heap);
        }
    }
    /* Zero Z ordinate: mark the point as being at infinity. */
    if ((err == MP_OKAY) && sp_256_iszero_9(p1->z)) {
        p1->infinity = 1;
    }
    if (err == MP_OKAY) {
            /* p2 = [u2]Q */
            err = sp_256_ecc_mulmod_9(p2, p2, u2, 0, 0, heap);
    }
    if ((err == MP_OKAY) && sp_256_iszero_9(p2->z)) {
        p2->infinity = 1;
    }

    if (err == MP_OKAY) {
        /* p1 = p1 + p2 */
        sp_256_add_points_9(p1, p2, tmp);
    }

    return err;
}

#ifdef HAVE_ECC_VERIFY
/* Verify the signature values with the hash and public key.
 *   e = Truncate(hash, 256)
 *   u1 = e/s mod order
 *   u2 = r/s mod order
 *   r == (u1.G + u2.Q)->x mod order
 * Optimization: Leave point in projective form.
 *   (x, y, 1) == (x' / z'*z', y' / z'*z'*z', z' / z')
 *   (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x'
 * The hash is truncated to the first 256 bits.
 *
 * hash     Hash that was signed.
 * hashLen  Length of the hash data in bytes. At most 32 bytes are used.
 * pX       X ordinate of the public key point.
 * pY       Y ordinate of the public key point.
 * pZ       Z ordinate of the public key point.
 * rm       First part of the signature (r) as an mp_int.
 * sm       Second part of the signature (s) as an mp_int.
 * res      Set to 1 when the signature verifies and 0 when it does not.
 *          Only written when the calculations before the comparison succeed.
 * heap     Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, the error of a failing
 * calculation step and MP_OKAY on success.
 */
int sp_ecc_verify_256(const byte* hash, word32 hashLen, const mp_int* pX,
    const mp_int* pY, const mp_int* pZ, const mp_int* rm, const mp_int* sm,
    int* res, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* u1 = NULL;
    sp_point_256* p1 = NULL;
#else
    sp_digit  u1[18 * 9];
    sp_point_256 p1[2];
#endif
    sp_digit* u2 = NULL;
    sp_digit* s = NULL;
    sp_digit* tmp = NULL;
    sp_point_256* p2 = NULL;
    sp_digit carry;
    sp_int32 c = 0;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        p1 = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
                                             DYNAMIC_TYPE_ECC);
        if (p1 == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        u1 = (sp_digit*)XMALLOC(sizeof(sp_digit) * 18 * 9, heap,
                                                              DYNAMIC_TYPE_ECC);
        if (u1 == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* One buffer holds u1, u2, s and the temporary storage; the second
         * point follows the first in the point array. */
        u2  = u1 + 2 * 9;
        s   = u1 + 4 * 9;
        tmp = u1 + 6 * 9;
        p2 = p1 + 1;

        if (hashLen > 32U) {
            hashLen = 32U;
        }

        /* u1 = e, u2 = r, s = s and p2 = public key point */
        sp_256_from_bin(u1, 9, hash, (int)hashLen);
        sp_256_from_mp(u2, 9, rm);
        sp_256_from_mp(s, 9, sm);
        sp_256_from_mp(p2->x, 9, pX);
        sp_256_from_mp(p2->y, 9, pY);
        sp_256_from_mp(p2->z, 9, pZ);

        err = sp_256_calc_vfy_point_9(p1, p2, s, u1, u2, tmp, heap);
    }
    if (err == MP_OKAY) {
        /* (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x' */
        /* Reload r and convert to Montgomery form. */
        sp_256_from_mp(u2, 9, rm);
        err = sp_256_mod_mul_norm_9(u2, u2, p256_mod);
    }

    if (err == MP_OKAY) {
        /* u1 = r.z'.z' mod prime */
            sp_256_mont_sqr_9(p1->z, p1->z, p256_mod, p256_mp_mod);
            sp_256_mont_mul_9(u1, u2, p1->z, p256_mod, p256_mp_mod);
        *res = (int)(sp_256_cmp_9(p1->x, u1) == 0);
        if (*res == 0) {
            /* Reload r and add order. */
            sp_256_from_mp(u2, 9, rm);
            carry = sp_256_add_9(u2, u2, p256_order);
            /* Carry means result is greater than mod and is not valid. */
            if (carry == 0) {
                sp_256_norm_9(u2);

                /* Compare with mod and if greater or equal then not valid. */
                c = sp_256_cmp_9(u2, p256_mod);
            }
        }
        /* c stays 0 on carry, so the second comparison is only done when
         * r + order is less than the prime. */
        if ((*res == 0) && (c < 0)) {
            /* Convert to Montgomery form */
            err = sp_256_mod_mul_norm_9(u2, u2, p256_mod);
            if (err == MP_OKAY) {
                /* u1 = (r + 1*order).z'.z' mod prime */
                {
                    sp_256_mont_mul_9(u1, u2, p1->z, p256_mod, p256_mp_mod);
                }
                *res = (sp_256_cmp_9(p1->x, u1) == 0);
            }
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (u1 != NULL)
        XFREE(u1, heap, DYNAMIC_TYPE_ECC);
    if (p1 != NULL)
        XFREE(p1, heap, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of the non-blocking P-256 verify operation,
 * sp_ecc_verify_256_nb(). That function accesses it through the data area
 * of the caller supplied sp_ecc_ctx_t.
 */
typedef struct sp_ecc_verify_256_ctx {
    /* Current step of the verification state machine (0 = INIT ... 12 = RES). */
    int state;
    /* Contexts of the nested non-blocking operations sharing storage.
     * dbl_ctx is not referenced by sp_ecc_verify_256_nb(). */
    union {
        sp_256_ecc_mulmod_9_ctx mulmod_ctx;
        sp_256_mont_inv_order_9_ctx mont_inv_order_ctx;
        sp_256_proj_point_dbl_9_ctx dbl_ctx;
        sp_256_proj_point_add_9_ctx add_ctx;
    };
    /* Hash as a number, then e/s; finally r.z'.z' for the comparison. */
    sp_digit u1[2*9];
    /* r as a number, then r/s; reloaded with r for the comparison. */
    sp_digit u2[2*9];
    /* s from the signature, then its inverse. */
    sp_digit s[2*9];
    /* Scratch space for the inversion and the point addition. */
    sp_digit tmp[2*9 * 6];
    /* [u1]G and then the sum of the two points. */
    sp_point_256 p1;
    /* Public key point and then [u2]Q. */
    sp_point_256 p2;
} sp_ecc_verify_256_ctx;

/* Non-blocking verify of the signature values with the hash and public key
 * for the P-256 curve.
 * Each call executes one case of the state machine below and returns. The
 * state is kept in sp_ctx between calls.
 *   e = Truncate(hash, 256)
 *   u1 = e/s mod order
 *   u2 = r/s mod order
 *   (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x'
 *
 * sp_ctx   Context holding the state of the operation between calls.
 *          NOTE(review): state 0 is the INIT step, so the context presumably
 *          has to be zeroed before the first call - confirm with callers.
 * hash     Hash that was signed.
 * hashLen  Length of the hash data in bytes. At most 32 bytes are used.
 * pX       X ordinate of the public key point.
 * pY       Y ordinate of the public key point.
 * pZ       Z ordinate of the public key point.
 * rm       First part of the signature (r) as an mp_int.
 * sm       Second part of the signature (s) as an mp_int.
 * res      Set, in the final state, to 1 when the signature verifies and 0
 *          when it does not.
 * heap     Heap to use for allocation.
 * returns FP_WOULDBLOCK while more calls are needed, MP_OKAY when res has
 * been set and otherwise the error of the failing step.
 */
int sp_ecc_verify_256_nb(sp_ecc_ctx_t* sp_ctx, const byte* hash,
    word32 hashLen, const mp_int* pX, const mp_int* pY, const mp_int* pZ,
    const mp_int* rm, const mp_int* sm, int* res, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_ecc_verify_256_ctx* ctx = (sp_ecc_verify_256_ctx*)sp_ctx->data;

    /* Compile time check: the array size is negative, and compilation fails,
     * when the verify context is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_ecc_verify_256_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
    case 0: /* INIT */
        if (hashLen > 32U) {
            hashLen = 32U;
        }

        /* u1 = e, u2 = r, s = s and p2 = public key point */
        sp_256_from_bin(ctx->u1, 9, hash, (int)hashLen);
        sp_256_from_mp(ctx->u2, 9, rm);
        sp_256_from_mp(ctx->s, 9, sm);
        sp_256_from_mp(ctx->p2.x, 9, pX);
        sp_256_from_mp(ctx->p2.y, 9, pY);
        sp_256_from_mp(ctx->p2.z, 9, pZ);
        ctx->state = 1;
        break;
    case 1: /* NORMS0 */
        /* Convert s to Montgomery form (mod order). */
        sp_256_mul_9(ctx->s, ctx->s, p256_norm_order);
        err = sp_256_mod_9(ctx->s, ctx->s, p256_order);
        if (err == MP_OKAY)
            ctx->state = 2;
        break;
    case 2: /* NORMS1 */
        sp_256_norm_9(ctx->s);
        /* Reset nested context ready for the inversion. */
        XMEMSET(&ctx->mont_inv_order_ctx, 0, sizeof(ctx->mont_inv_order_ctx));
        ctx->state = 3;
        break;
    case 3: /* NORMS2 */
        /* s = 1/s mod order - stays in this state until complete. */
        err = sp_256_mont_inv_order_9_nb((sp_ecc_ctx_t*)&ctx->mont_inv_order_ctx, ctx->s, ctx->s, ctx->tmp);
        if (err == MP_OKAY) {
            ctx->state = 4;
        }
        break;
    case 4: /* NORMS3 */
        /* u1 = e/s mod order */
        sp_256_mont_mul_order_9(ctx->u1, ctx->u1, ctx->s);
        ctx->state = 5;
        break;
    case 5: /* NORMS4 */
        /* u2 = r/s mod order */
        sp_256_mont_mul_order_9(ctx->u2, ctx->u2, ctx->s);
        /* Reset nested context ready for the scalar multiplication. */
        XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx));
        ctx->state = 6;
        break;
    case 6: /* MULBASE */
        /* p1 = [u1]G */
        err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p1, &p256_base, ctx->u1, 0, 0, heap);
        if (err == MP_OKAY) {
            if (sp_256_iszero_9(ctx->p1.z)) {
                ctx->p1.infinity = 1;
            }
            XMEMSET(&ctx->mulmod_ctx, 0, sizeof(ctx->mulmod_ctx));
            ctx->state = 7;
        }
        break;
    case 7: /* MULMOD */
        /* p2 = [u2]Q */
        err = sp_256_ecc_mulmod_9_nb((sp_ecc_ctx_t*)&ctx->mulmod_ctx, &ctx->p2, &ctx->p2, ctx->u2, 0, 0, heap);
        if (err == MP_OKAY) {
            if (sp_256_iszero_9(ctx->p2.z)) {
                ctx->p2.infinity = 1;
            }
            /* Reset nested context ready for the point addition. */
            XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx));
            ctx->state = 8;
        }
        break;
    case 8: /* ADD */
        /* p1 = p1 + p2
         * NOTE(review): the blocking path adds the points with
         * sp_256_add_points_9(), which also handles equal points and a
         * result at infinity; no such handling follows the addition here -
         * confirm this is intended. */
        err = sp_256_proj_point_add_9_nb((sp_ecc_ctx_t*)&ctx->add_ctx, &ctx->p1, &ctx->p1, &ctx->p2, ctx->tmp);
        if (err == MP_OKAY)
            ctx->state = 9;
        break;
    case 9: /* MONT */
        /* (r + n*order).z'.z' mod prime == (u1.G + u2.Q)->x' */
        /* Reload r and convert to Montgomery form. */
        sp_256_from_mp(ctx->u2, 9, rm);
        err = sp_256_mod_mul_norm_9(ctx->u2, ctx->u2, p256_mod);
        if (err == MP_OKAY)
            ctx->state = 10;
        break;
    case 10: /* SQR */
        /* u1 = r.z'.z' mod prime */
        /* First step: p1.z = z'.z' */
        sp_256_mont_sqr_9(ctx->p1.z, ctx->p1.z, p256_mod, p256_mp_mod);
        ctx->state = 11;
        break;
    case 11: /* MUL */
        /* Second step: u1 = r.(z'.z') */
        sp_256_mont_mul_9(ctx->u1, ctx->u2, ctx->p1.z, p256_mod, p256_mp_mod);
        ctx->state = 12;
        break;
    case 12: /* RES */
    {
        sp_int32 c = 0;
        err = MP_OKAY; /* math okay, now check result */
        *res = (int)(sp_256_cmp_9(ctx->p1.x, ctx->u1) == 0);
        if (*res == 0) {
            sp_digit carry;

            /* Reload r and add order. */
            sp_256_from_mp(ctx->u2, 9, rm);
            carry = sp_256_add_9(ctx->u2, ctx->u2, p256_order);
            /* Carry means result is greater than mod and is not valid. */
            if (carry == 0) {
                sp_256_norm_9(ctx->u2);

                /* Compare with mod and if greater or equal then not valid. */
                c = sp_256_cmp_9(ctx->u2, p256_mod);
            }
        }
        /* c stays 0 on carry, so the second comparison is only done when
         * r + order is less than the prime. */
        if ((*res == 0) && (c < 0)) {
            /* Convert to Montgomery form */
            err = sp_256_mod_mul_norm_9(ctx->u2, ctx->u2, p256_mod);
            if (err == MP_OKAY) {
                /* u1 = (r + 1*order).z'.z' mod prime */
                sp_256_mont_mul_9(ctx->u1, ctx->u2, ctx->p1.z, p256_mod,
                                                            p256_mp_mod);
                *res = (int)(sp_256_cmp_9(ctx->p1.x, ctx->u1) == 0);
            }
        }
        break;
    }
    } /* switch */

    /* A step succeeding is only completion when it was the final step. */
    if (err == MP_OKAY && ctx->state != 12) {
        err = FP_WOULDBLOCK;
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
#endif /* HAVE_ECC_VERIFY */

#ifdef HAVE_ECC_CHECK_KEY
/* Check that the x and y ordinates satisfy the curve equation.
 *
 * point  EC point.
 * heap   Heap to use if dynamically allocating.
 * returns MEMORY_E if dynamic memory allocation fails, MP_VAL if the point is
 * not on the curve and MP_OKAY otherwise.
 */
static int sp_256_ecc_is_point_9(const sp_point_256* point,
    void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* lhs = NULL;
#else
    sp_digit lhs[9 * 4];
#endif
    sp_digit* cube = NULL;
    int err = MP_OKAY;
    int n;

    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    lhs = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9 * 4, heap, DYNAMIC_TYPE_ECC);
    if (lhs == NULL) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Second number is in the upper half of the buffer. */
        cube = lhs + 2 * 9;

        /* lhs = y^2 */
        sp_256_sqr_9(lhs, point->y);
        (void)sp_256_mod_9(lhs, lhs, p256_mod);

        /* cube = x^3 */
        sp_256_sqr_9(cube, point->x);
        (void)sp_256_mod_9(cube, cube, p256_mod);
        sp_256_mul_9(cube, cube, point->x);
        (void)sp_256_mod_9(cube, cube, p256_mod);

        /* lhs = y^2 - x^3 */
        sp_256_mont_sub_9(lhs, lhs, cube, p256_mod);

        /* lhs = y^2 - x^3 + 3.x, as a = -3 */
        for (n = 0; n < 3; n++) {
            sp_256_mont_add_9(lhs, lhs, point->x, p256_mod);
        }

        /* On the curve when: y^2 - x^3 - a.x = b */
        if (sp_256_cmp_9(lhs, p256_b) != 0) {
            err = MP_VAL;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (lhs != NULL) {
        XFREE(lhs, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Check that the affine ordinates, as mp_ints, are a point on the curve.
 *
 * pX  X ordinate of EC point.
 * pY  Y ordinate of EC point.
 * returns MEMORY_E if dynamic memory allocation fails, MP_VAL if the point is
 * not on the curve and MP_OKAY otherwise.
 */
int sp_ecc_is_point_256(const mp_int* pX, const mp_int* pY)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_256* pt = NULL;
#else
    sp_point_256 pt[1];
#endif
    const byte unit[1] = { 1 };
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    pt = (sp_point_256*)XMALLOC(sizeof(sp_point_256), NULL,
                                      DYNAMIC_TYPE_ECC);
    if (pt == NULL) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Load point with a Z ordinate of one. */
        sp_256_from_mp(pt->x, 9, pX);
        sp_256_from_mp(pt->y, 9, pY);
        sp_256_from_bin(pt->z, 9, unit, (int)sizeof(unit));

        /* No heap hint available in this API. */
        err = sp_256_ecc_is_point_9(pt, NULL);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (pt != NULL) {
        XFREE(pt, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Check that the private scalar generates the EC point (px, py), the point is
 * on the curve and the point has the correct order.
 *
 * pX     X ordinate of EC point.
 * pY     Y ordinate of EC point.
 * privm  Private scalar that generates EC point.
 * returns MEMORY_E if dynamic memory allocation fails, MP_VAL if the point is
 * not on the curve, ECC_INF_E if the point does not have the correct order,
 * ECC_PRIV_KEY_E when the private scalar doesn't generate the EC point and
 * MP_OKAY otherwise.
 */
int sp_ecc_check_key_256(const mp_int* pX, const mp_int* pY,
    const mp_int* privm, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* priv = NULL;
    sp_point_256* pub = NULL;
#else
    sp_digit priv[9];
    sp_point_256 pub[2];
#endif
    sp_point_256* p = NULL;
    const byte one[1] = { 1 };
    int err = MP_OKAY;


    /* Quick check the lengs of public key ordinates and private key are in
     * range. Proper check later.
     */
    if (((mp_count_bits(pX) > 256) ||
        (mp_count_bits(pY) > 256) ||
        ((privm != NULL) && (mp_count_bits(privm) > 256)))) {
        err = ECC_OUT_OF_RANGE_E;
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (err == MP_OKAY) {
        pub = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, heap,
                                           DYNAMIC_TYPE_ECC);
        if (pub == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY && privm) {
        priv = (sp_digit*)XMALLOC(sizeof(sp_digit) * 9, heap,
                                  DYNAMIC_TYPE_ECC);
        if (priv == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        p = pub + 1;

        sp_256_from_mp(pub->x, 9, pX);
        sp_256_from_mp(pub->y, 9, pY);
        sp_256_from_bin(pub->z, 9, one, (int)sizeof(one));
        if (privm)
            sp_256_from_mp(priv, 9, privm);

        /* Check point at infinitiy. */
        if ((sp_256_iszero_9(pub->x) != 0) &&
            (sp_256_iszero_9(pub->y) != 0)) {
            err = ECC_INF_E;
        }
    }

    /* Check range of X and Y */
    if ((err == MP_OKAY) &&
            ((sp_256_cmp_9(pub->x, p256_mod) >= 0) ||
             (sp_256_cmp_9(pub->y, p256_mod) >= 0))) {
        err = ECC_OUT_OF_RANGE_E;
    }

    if (err == MP_OKAY) {
        /* Check point is on curve */
        err = sp_256_ecc_is_point_9(pub, heap);
    }

    if (err == MP_OKAY) {
        /* Point * order = infinity */
            err = sp_256_ecc_mulmod_9(p, pub, p256_order, 1, 1, heap);
    }
    /* Check result is infinity */
    if ((err == MP_OKAY) && ((sp_256_iszero_9(p->x) == 0) ||
                             (sp_256_iszero_9(p->y) == 0))) {
        err = ECC_INF_E;
    }

    if (privm) {
        if (err == MP_OKAY) {
            /* Base * private = point */
                err = sp_256_ecc_mulmod_base_9(p, priv, 1, 1, heap);
        }
        /* Check result is public key */
        if ((err == MP_OKAY) &&
                ((sp_256_cmp_9(p->x, pub->x) != 0) ||
                 (sp_256_cmp_9(p->y, pub->y) != 0))) {
            err = ECC_PRIV_KEY_E;
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (pub != NULL)
        XFREE(pub, heap, DYNAMIC_TYPE_ECC);
    if (priv != NULL)
        XFREE(priv, heap, DYNAMIC_TYPE_ECC);
#endif

    return err;
}
#endif
#ifdef WOLFSSL_PUBLIC_ECC_ADD_DBL
/* Add two projective EC points, given as mp_ints, together.
 * (pX, pY, pZ) + (qX, qY, qZ) = (rX, rY, rZ)
 *
 * pX   First EC point's X ordinate.
 * pY   First EC point's Y ordinate.
 * pZ   First EC point's Z ordinate.
 * qX   Second EC point's X ordinate.
 * qY   Second EC point's Y ordinate.
 * qZ   Second EC point's Z ordinate.
 * rX   Resultant EC point's X ordinate.
 * rY   Resultant EC point's Y ordinate.
 * rZ   Resultant EC point's Z ordinate.
 * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise.
 */
int sp_ecc_proj_add_point_256(mp_int* pX, mp_int* pY, mp_int* pZ,
                              mp_int* qX, mp_int* qY, mp_int* qZ,
                              mp_int* rX, mp_int* rY, mp_int* rZ)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* scratch = NULL;
    sp_point_256* first = NULL;
#else
    sp_digit scratch[2 * 9 * 6];
    sp_point_256 first[2];
#endif
    sp_point_256* second = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    /* Both points are in the one allocation. */
    first = (sp_point_256*)XMALLOC(sizeof(sp_point_256) * 2, NULL,
                                   DYNAMIC_TYPE_ECC);
    if (first == NULL) {
        err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        scratch = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 6, NULL,
                                     DYNAMIC_TYPE_ECC);
        if (scratch == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        second = first + 1;

        /* Load first point - at infinity when X and Y are zero. */
        sp_256_from_mp(first->x, 9, pX);
        sp_256_from_mp(first->y, 9, pY);
        sp_256_from_mp(first->z, 9, pZ);
        /* Load second point - at infinity when X and Y are zero. */
        sp_256_from_mp(second->x, 9, qX);
        sp_256_from_mp(second->y, 9, qY);
        sp_256_from_mp(second->z, 9, qZ);
        first->infinity = sp_256_iszero_9(first->x) &
                          sp_256_iszero_9(first->y);
        second->infinity = sp_256_iszero_9(second->x) &
                           sp_256_iszero_9(second->y);

        sp_256_proj_point_add_9(first, first, second, scratch);

        /* Hand back the ordinates - stop at the first failure. */
        err = sp_256_to_mp(first->x, rX);
        if (err == MP_OKAY) {
            err = sp_256_to_mp(first->y, rY);
        }
        if (err == MP_OKAY) {
            err = sp_256_to_mp(first->z, rZ);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (scratch != NULL) {
        XFREE(scratch, NULL, DYNAMIC_TYPE_ECC);
    }
    if (first != NULL) {
        XFREE(first, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Double a projective EC point given as mp_ints.
 * (pX, pY, pZ) + (pX, pY, pZ) = (rX, rY, rZ)
 *
 * pX   EC point's X ordinate.
 * pY   EC point's Y ordinate.
 * pZ   EC point's Z ordinate.
 * rX   Resultant EC point's X ordinate.
 * rY   Resultant EC point's Y ordinate.
 * rZ   Resultant EC point's Z ordinate.
 * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise.
 */
int sp_ecc_proj_dbl_point_256(mp_int* pX, mp_int* pY, mp_int* pZ,
                              mp_int* rX, mp_int* rY, mp_int* rZ)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* scratch = NULL;
    sp_point_256* pt = NULL;
#else
    sp_digit scratch[2 * 9 * 2];
    sp_point_256 pt[1];
#endif
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    pt = (sp_point_256*)XMALLOC(sizeof(sp_point_256), NULL,
                                DYNAMIC_TYPE_ECC);
    if (pt == NULL) {
        err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        scratch = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 2, NULL,
                                     DYNAMIC_TYPE_ECC);
        if (scratch == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Load point - at infinity when X and Y are zero. */
        sp_256_from_mp(pt->x, 9, pX);
        sp_256_from_mp(pt->y, 9, pY);
        sp_256_from_mp(pt->z, 9, pZ);
        pt->infinity = sp_256_iszero_9(pt->x) &
                       sp_256_iszero_9(pt->y);

        sp_256_proj_point_dbl_9(pt, pt, scratch);

        /* Hand back the ordinates - stop at the first failure. */
        err = sp_256_to_mp(pt->x, rX);
        if (err == MP_OKAY) {
            err = sp_256_to_mp(pt->y, rY);
        }
        if (err == MP_OKAY) {
            err = sp_256_to_mp(pt->z, rZ);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (scratch != NULL) {
        XFREE(scratch, NULL, DYNAMIC_TYPE_ECC);
    }
    if (pt != NULL) {
        XFREE(pt, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Map a projective EC point, given as mp_ints, to affine in place.
 * pZ will be one.
 *
 * pX   EC point's X ordinate.
 * pY   EC point's Y ordinate.
 * pZ   EC point's Z ordinate.
 * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise.
 */
int sp_ecc_map_256(mp_int* pX, mp_int* pY, mp_int* pZ)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* scratch = NULL;
    sp_point_256* pt = NULL;
#else
    sp_digit scratch[2 * 9 * 4];
    sp_point_256 pt[1];
#endif
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    pt = (sp_point_256*)XMALLOC(sizeof(sp_point_256), NULL,
                                DYNAMIC_TYPE_ECC);
    if (pt == NULL) {
        err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        scratch = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 9 * 4, NULL,
                                     DYNAMIC_TYPE_ECC);
        if (scratch == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Load point - at infinity when X and Y are zero. */
        sp_256_from_mp(pt->x, 9, pX);
        sp_256_from_mp(pt->y, 9, pY);
        sp_256_from_mp(pt->z, 9, pZ);
        pt->infinity = sp_256_iszero_9(pt->x) &
                       sp_256_iszero_9(pt->y);

        sp_256_map_9(pt, pt, scratch);

        /* Write the ordinates back over the inputs - stop at the first
         * failure. */
        err = sp_256_to_mp(pt->x, pX);
        if (err == MP_OKAY) {
            err = sp_256_to_mp(pt->y, pY);
        }
        if (err == MP_OKAY) {
            err = sp_256_to_mp(pt->z, pZ);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (scratch != NULL) {
        XFREE(scratch, NULL, DYNAMIC_TYPE_ECC);
    }
    if (pt != NULL) {
        XFREE(pt, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}
#endif /* WOLFSSL_PUBLIC_ECC_ADD_DBL */
#ifdef HAVE_COMP_KEY
/* Find the square root of a number mod the prime of the curve.
 *
 * The root is obtained by raising y to a fixed exponent using the Montgomery
 * square and multiply routines: the commented exponent is built up in steps
 * and finally shifted left by 94 bits with repeated squaring.
 *
 * y  The number to operate on and the result.
 * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise.
 */
static int sp_256_mont_sqrt_9(sp_digit* y)
{
    int ret = MP_OKAY;
    int n;
    sp_digit* sq = NULL;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* acc = NULL;
#else
    sp_digit acc[4 * 9];
#endif

#ifdef WOLFSSL_SP_SMALL_STACK
    acc = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 9, NULL,
                             DYNAMIC_TYPE_ECC);
    if (acc == NULL) {
        ret = MEMORY_E;
    }
#endif

    if (ret == MP_OKAY) {
        /* Second half of the buffer is the squaring temporary. */
        sq = acc + 2 * 9;

        /* sq = y ^ 0x2 */
        sp_256_mont_sqr_9(sq, y, p256_mod, p256_mp_mod);
        /* acc = y ^ 0x3 */
        sp_256_mont_mul_9(acc, sq, y, p256_mod, p256_mp_mod);
        /* Double the run of one bits in the exponent on each pass:
         * 0x3 -> 0xf -> 0xff -> 0xffff -> 0xffffffff */
        for (n = 2; n <= 16; n *= 2) {
            /* sq = acc ^ (2 ^ n) */
            sp_256_mont_sqr_n_9(sq, acc, n, p256_mod, p256_mp_mod);
            /* acc = acc * sq */
            sp_256_mont_mul_9(acc, acc, sq, p256_mod, p256_mp_mod);
        }
        /* acc = y ^ 0xffffffff00000000 */
        sp_256_mont_sqr_n_9(acc, acc, 32, p256_mod, p256_mp_mod);
        /* acc = y ^ 0xffffffff00000001 */
        sp_256_mont_mul_9(acc, acc, y, p256_mod, p256_mp_mod);
        /* acc = y ^ 0xffffffff00000001000000000000000000000000 */
        sp_256_mont_sqr_n_9(acc, acc, 96, p256_mod, p256_mp_mod);
        /* acc = y ^ 0xffffffff00000001000000000000000000000001 */
        sp_256_mont_mul_9(acc, acc, y, p256_mod, p256_mp_mod);
        /* y = acc squared 94 times */
        sp_256_mont_sqr_n_9(y, acc, 94, p256_mod, p256_mp_mod);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (acc != NULL) {
        XFREE(acc, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return ret;
}


/* Uncompress the point given the X ordinate.
 *
 * Evaluates y = sqrt(x^3 - 3x + b) modulo the P256 prime and then picks the
 * root whose lowest bit matches the requested parity.
 *
 * xm    X ordinate.
 * odd   Whether the Y ordinate is odd.
 * ym    Calculated Y ordinate.
 * returns MEMORY_E if dynamic memory allocation fails and MP_OKAY otherwise.
 */
int sp_ecc_uncompress_256(mp_int* xm, int odd, mp_int* ym)
{
    int ret = MP_OKAY;
    int i;
    sp_digit* yd = NULL;
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* xd = NULL;
#else
    sp_digit xd[4 * 9];
#endif

#ifdef WOLFSSL_SP_SMALL_STACK
    xd = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4 * 9, NULL, DYNAMIC_TYPE_ECC);
    if (xd == NULL) {
        ret = MEMORY_E;
    }
#endif

    if (ret == MP_OKAY) {
        /* Second half of the buffer is the working area for y. */
        yd = xd + 2 * 9;

        sp_256_from_mp(xd, 9, xm);
        ret = sp_256_mod_mul_norm_9(xd, xd, p256_mod);
    }
    if (ret == MP_OKAY) {
        /* yd = x^2, then yd = x^3 */
        sp_256_mont_sqr_9(yd, xd, p256_mod, p256_mp_mod);
        sp_256_mont_mul_9(yd, yd, xd, p256_mod, p256_mp_mod);
        /* yd = x^3 - 3x, as three subtractions of x */
        for (i = 0; i < 3; i++) {
            sp_256_mont_sub_9(yd, yd, xd, p256_mod);
        }
        /* x is no longer needed: reuse its storage for b */
        ret = sp_256_mod_mul_norm_9(xd, p256_b, p256_mod);
    }
    if (ret == MP_OKAY) {
        /* yd = x^3 - 3x + b */
        sp_256_mont_add_9(yd, yd, xd, p256_mod);
        /* yd = sqrt(x^3 - 3x + b) */
        ret = sp_256_mont_sqrt_9(yd);
    }
    if (ret == MP_OKAY) {
        /* Zero the upper 9 digits before running the reduction. */
        XMEMSET(yd + 9, 0, 9U * sizeof(sp_digit));
        sp_256_mont_reduce_9(yd, p256_mod, p256_mp_mod);
        /* Parity differs from what was asked for: use modulus - y instead. */
        if (((word32)yd[0] & 1U) != ((word32)odd & 1U)) {
            sp_256_mont_sub_9(yd, p256_mod, yd, p256_mod);
        }

        ret = sp_256_to_mp(yd, ym);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (xd != NULL) {
        XFREE(xd, NULL, DYNAMIC_TYPE_ECC);
    }
#endif

    return ret;
}
#endif
#endif /* !WOLFSSL_SP_NO_256 */
#ifdef WOLFSSL_SP_384

/* Point structure to use.
 *
 * Each ordinate has room for 2 * 15 digits: a P384 value occupies 15 digits
 * of 26 bits, and the 15-digit multiply and square routines produce results
 * of 30 digits.
 */
typedef struct sp_point_384 {
    /* X ordinate of point. */
    sp_digit x[2 * 15];
    /* Y ordinate of point. */
    sp_digit y[2 * 15];
    /* Z ordinate of point. */
    sp_digit z[2 * 15];
    /* Indicates point is at infinity. */
    int infinity;
} sp_point_384;

/* The modulus (prime) of the curve P384.
 *
 * Stored as 15 digits of 26 bits, least significant digit first; the top
 * digit carries the remaining 20 bits (14 * 26 + 20 = 384).
 */
static const sp_digit p384_mod[15] = {
    0x3ffffff,0x000003f,0x0000000,0x3fc0000,0x2ffffff,0x3ffffff,0x3ffffff,
    0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,
    0x00fffff
};
/* The Montgomery normalizer for modulus of the curve P384.
 * The digits equal 2^384 minus the modulus.
 */
static const sp_digit p384_norm_mod[15] = {
    0x0000001,0x3ffffc0,0x3ffffff,0x003ffff,0x1000000,0x0000000,0x0000000,
    0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,
    0x0000000
};
/* The Montgomery multiplier for modulus of the curve P384. */
static sp_digit p384_mp_mod = 0x000001;
#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
                                            defined(HAVE_ECC_VERIFY)
/* The order of the curve P384.
 * Only compiled in for key generation validation, signing or verification.
 */
static const sp_digit p384_order[15] = {
    0x0c52973,0x3065ab3,0x277aece,0x2c922c2,0x3581a0d,0x10dcb77,0x234d81f,
    0x3ffff1d,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,
    0x00fffff
};
#endif
/* The order of the curve P384 minus 2.
 * Differs from the order only in the least significant digit.
 */
static const sp_digit p384_order2[15] = {
    0x0c52971,0x3065ab3,0x277aece,0x2c922c2,0x3581a0d,0x10dcb77,0x234d81f,
    0x3ffff1d,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,0x3ffffff,
    0x00fffff
};
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery normalizer for order of the curve P384. */
static const sp_digit p384_norm_order[15] = {
    0x33ad68d,0x0f9a54c,0x1885131,0x136dd3d,0x0a7e5f2,0x2f23488,0x1cb27e0,
    0x00000e2,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,
    0x0000000
};
#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery multiplier for order of the curve P384. */
static sp_digit p384_mp_order = 0x8fdc45;
#endif
/* The base point of curve P384.
 *
 * Each ordinate fills a 2 * 15 digit array: the low 15 digits hold the value
 * (26 bits per digit, least significant first) and the upper 15 digits are
 * zero. Z is one and the point is not at infinity.
 */
static const sp_point_384 p384_base = {
    /* X ordinate */
    {
        0x2760ab7,0x1178e1c,0x296c3a5,0x176fd54,0x05502f2,0x0950a8e,0x3741e08,
        0x26e6167,0x3628ba7,0x11b874e,0x3320ad7,0x2c71c7b,0x305378e,0x288afa2,
        0x00aa87c,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Y ordinate */
    {
        0x0ea0e5f,0x0c75f24,0x019d7a4,0x33875fa,0x00a60b1,0x17c2e30,0x1a3113b,
        0x051f3a7,0x1bd289a,0x27e3d07,0x1292dc2,0x27a62fe,0x22c6f5d,0x392a589,
        0x003617d,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* Z ordinate */
    {
        0x0000001,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,
        0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,0x0000000,
        0x0000000,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0,
        (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0, (sp_digit)0
    },
    /* infinity */
    0
};
#if defined(HAVE_ECC_CHECK_KEY) || defined(HAVE_COMP_KEY)
/* The b coefficient of the curve P384.
 * Stored as 15 digits of 26 bits, least significant digit first. Only
 * compiled in when key checking or compressed keys are enabled.
 */
static const sp_digit p384_b[15] = {
    0x3ec2aef,0x1723b74,0x119d2a8,0x23628bb,0x2c65639,0x004e1d6,0x14088f5,
    0x104480c,0x06efe81,0x2460767,0x23f82d1,0x23815af,0x2e7e498,0x3e9f88f,
    0x00b3312
};
#endif

#ifdef WOLFSSL_SP_SMALL
/* Multiply a and b into r. (r = a * b)
 *
 * Compact schoolbook multiplication of two 15-digit numbers with 26-bit
 * digits. Columns are processed from the most significant down to the least
 * significant; the low 26 bits of each column sum are held back and emitted
 * together with the carry of the next lower column.
 *
 * r  A single precision integer. Receives 30 digits.
 * a  A single precision integer. 15 digits are read.
 * b  A single precision integer. 15 digits are read.
 */
SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_uint64 pending;
    sp_uint64 colSum;
    int col;
    int idx;
    int last;

    /* Top column has a single product: its high part is digit 29 and its
     * low part waits for the carry out of column 27. */
    pending = ((sp_uint64)a[14]) * b[14];
    r[29] = (sp_digit)(pending >> 26);
    pending &= 0x3ffffff;

    col = 27;
    while (col >= 0) {
        /* Range of a[] indices that have a partner b[col - idx] in range. */
        idx  = (col >= 15) ? (col - 14) : 0;
        last = (col >= 15) ? 14 : col;

        colSum = 0;
        while (idx <= last) {
            colSum += ((sp_uint64)a[idx]) * b[col - idx];
            idx++;
        }

        /* Fold this column's carry into the digit above, then emit it. */
        pending += colSum >> 26;
        r[col + 2] += (sp_digit)(pending >> 26);
        r[col + 1]  = (sp_digit)(pending & 0x3ffffff);
        /* Low bits of this column become the next pending digit. */
        pending = colSum & 0x3ffffff;

        col--;
    }
    r[0] = (sp_digit)pending;
}

#else
/* Multiply a and b into r. (r = a * b)
 *
 * Fully unrolled schoolbook multiplication of two 15-digit numbers with
 * 26-bit digits. Column sums are accumulated alternately in t0 and t1: once
 * the next column has been summed, the low 26 bits of the previous one are
 * stored as a result digit and its upper bits are carried forward.
 *
 * r  A single precision integer. Receives 30 digits.
 * a  A single precision integer. 15 digits are read.
 * b  A single precision integer. 15 digits are read.
 */
SP_NOINLINE static void sp_384_mul_15(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_int64 t0;
    sp_int64 t1;
    /* Digits 0..14 of the result are staged here and copied into r at the
     * end; presumably so that r may overlap a or b - confirm with callers. */
    sp_digit t[15];

    /* Columns 0..14: one more product per column. */
    t0 = ((sp_int64)a[ 0]) * b[ 0];
    t1 = ((sp_int64)a[ 0]) * b[ 1]
       + ((sp_int64)a[ 1]) * b[ 0];
    t[ 0] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[ 2]
       + ((sp_int64)a[ 1]) * b[ 1]
       + ((sp_int64)a[ 2]) * b[ 0];
    t[ 1] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[ 3]
       + ((sp_int64)a[ 1]) * b[ 2]
       + ((sp_int64)a[ 2]) * b[ 1]
       + ((sp_int64)a[ 3]) * b[ 0];
    t[ 2] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[ 4]
       + ((sp_int64)a[ 1]) * b[ 3]
       + ((sp_int64)a[ 2]) * b[ 2]
       + ((sp_int64)a[ 3]) * b[ 1]
       + ((sp_int64)a[ 4]) * b[ 0];
    t[ 3] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[ 5]
       + ((sp_int64)a[ 1]) * b[ 4]
       + ((sp_int64)a[ 2]) * b[ 3]
       + ((sp_int64)a[ 3]) * b[ 2]
       + ((sp_int64)a[ 4]) * b[ 1]
       + ((sp_int64)a[ 5]) * b[ 0];
    t[ 4] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[ 6]
       + ((sp_int64)a[ 1]) * b[ 5]
       + ((sp_int64)a[ 2]) * b[ 4]
       + ((sp_int64)a[ 3]) * b[ 3]
       + ((sp_int64)a[ 4]) * b[ 2]
       + ((sp_int64)a[ 5]) * b[ 1]
       + ((sp_int64)a[ 6]) * b[ 0];
    t[ 5] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[ 7]
       + ((sp_int64)a[ 1]) * b[ 6]
       + ((sp_int64)a[ 2]) * b[ 5]
       + ((sp_int64)a[ 3]) * b[ 4]
       + ((sp_int64)a[ 4]) * b[ 3]
       + ((sp_int64)a[ 5]) * b[ 2]
       + ((sp_int64)a[ 6]) * b[ 1]
       + ((sp_int64)a[ 7]) * b[ 0];
    t[ 6] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[ 8]
       + ((sp_int64)a[ 1]) * b[ 7]
       + ((sp_int64)a[ 2]) * b[ 6]
       + ((sp_int64)a[ 3]) * b[ 5]
       + ((sp_int64)a[ 4]) * b[ 4]
       + ((sp_int64)a[ 5]) * b[ 3]
       + ((sp_int64)a[ 6]) * b[ 2]
       + ((sp_int64)a[ 7]) * b[ 1]
       + ((sp_int64)a[ 8]) * b[ 0];
    t[ 7] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[ 9]
       + ((sp_int64)a[ 1]) * b[ 8]
       + ((sp_int64)a[ 2]) * b[ 7]
       + ((sp_int64)a[ 3]) * b[ 6]
       + ((sp_int64)a[ 4]) * b[ 5]
       + ((sp_int64)a[ 5]) * b[ 4]
       + ((sp_int64)a[ 6]) * b[ 3]
       + ((sp_int64)a[ 7]) * b[ 2]
       + ((sp_int64)a[ 8]) * b[ 1]
       + ((sp_int64)a[ 9]) * b[ 0];
    t[ 8] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[10]
       + ((sp_int64)a[ 1]) * b[ 9]
       + ((sp_int64)a[ 2]) * b[ 8]
       + ((sp_int64)a[ 3]) * b[ 7]
       + ((sp_int64)a[ 4]) * b[ 6]
       + ((sp_int64)a[ 5]) * b[ 5]
       + ((sp_int64)a[ 6]) * b[ 4]
       + ((sp_int64)a[ 7]) * b[ 3]
       + ((sp_int64)a[ 8]) * b[ 2]
       + ((sp_int64)a[ 9]) * b[ 1]
       + ((sp_int64)a[10]) * b[ 0];
    t[ 9] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[11]
       + ((sp_int64)a[ 1]) * b[10]
       + ((sp_int64)a[ 2]) * b[ 9]
       + ((sp_int64)a[ 3]) * b[ 8]
       + ((sp_int64)a[ 4]) * b[ 7]
       + ((sp_int64)a[ 5]) * b[ 6]
       + ((sp_int64)a[ 6]) * b[ 5]
       + ((sp_int64)a[ 7]) * b[ 4]
       + ((sp_int64)a[ 8]) * b[ 3]
       + ((sp_int64)a[ 9]) * b[ 2]
       + ((sp_int64)a[10]) * b[ 1]
       + ((sp_int64)a[11]) * b[ 0];
    t[10] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[12]
       + ((sp_int64)a[ 1]) * b[11]
       + ((sp_int64)a[ 2]) * b[10]
       + ((sp_int64)a[ 3]) * b[ 9]
       + ((sp_int64)a[ 4]) * b[ 8]
       + ((sp_int64)a[ 5]) * b[ 7]
       + ((sp_int64)a[ 6]) * b[ 6]
       + ((sp_int64)a[ 7]) * b[ 5]
       + ((sp_int64)a[ 8]) * b[ 4]
       + ((sp_int64)a[ 9]) * b[ 3]
       + ((sp_int64)a[10]) * b[ 2]
       + ((sp_int64)a[11]) * b[ 1]
       + ((sp_int64)a[12]) * b[ 0];
    t[11] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 0]) * b[13]
       + ((sp_int64)a[ 1]) * b[12]
       + ((sp_int64)a[ 2]) * b[11]
       + ((sp_int64)a[ 3]) * b[10]
       + ((sp_int64)a[ 4]) * b[ 9]
       + ((sp_int64)a[ 5]) * b[ 8]
       + ((sp_int64)a[ 6]) * b[ 7]
       + ((sp_int64)a[ 7]) * b[ 6]
       + ((sp_int64)a[ 8]) * b[ 5]
       + ((sp_int64)a[ 9]) * b[ 4]
       + ((sp_int64)a[10]) * b[ 3]
       + ((sp_int64)a[11]) * b[ 2]
       + ((sp_int64)a[12]) * b[ 1]
       + ((sp_int64)a[13]) * b[ 0];
    t[12] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 0]) * b[14]
       + ((sp_int64)a[ 1]) * b[13]
       + ((sp_int64)a[ 2]) * b[12]
       + ((sp_int64)a[ 3]) * b[11]
       + ((sp_int64)a[ 4]) * b[10]
       + ((sp_int64)a[ 5]) * b[ 9]
       + ((sp_int64)a[ 6]) * b[ 8]
       + ((sp_int64)a[ 7]) * b[ 7]
       + ((sp_int64)a[ 8]) * b[ 6]
       + ((sp_int64)a[ 9]) * b[ 5]
       + ((sp_int64)a[10]) * b[ 4]
       + ((sp_int64)a[11]) * b[ 3]
       + ((sp_int64)a[12]) * b[ 2]
       + ((sp_int64)a[13]) * b[ 1]
       + ((sp_int64)a[14]) * b[ 0];
    t[13] = t1 & 0x3ffffff; t0 += t1 >> 26;
    /* Columns 15..28: one fewer product per column. */
    t1 = ((sp_int64)a[ 1]) * b[14]
       + ((sp_int64)a[ 2]) * b[13]
       + ((sp_int64)a[ 3]) * b[12]
       + ((sp_int64)a[ 4]) * b[11]
       + ((sp_int64)a[ 5]) * b[10]
       + ((sp_int64)a[ 6]) * b[ 9]
       + ((sp_int64)a[ 7]) * b[ 8]
       + ((sp_int64)a[ 8]) * b[ 7]
       + ((sp_int64)a[ 9]) * b[ 6]
       + ((sp_int64)a[10]) * b[ 5]
       + ((sp_int64)a[11]) * b[ 4]
       + ((sp_int64)a[12]) * b[ 3]
       + ((sp_int64)a[13]) * b[ 2]
       + ((sp_int64)a[14]) * b[ 1];
    t[14] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 2]) * b[14]
       + ((sp_int64)a[ 3]) * b[13]
       + ((sp_int64)a[ 4]) * b[12]
       + ((sp_int64)a[ 5]) * b[11]
       + ((sp_int64)a[ 6]) * b[10]
       + ((sp_int64)a[ 7]) * b[ 9]
       + ((sp_int64)a[ 8]) * b[ 8]
       + ((sp_int64)a[ 9]) * b[ 7]
       + ((sp_int64)a[10]) * b[ 6]
       + ((sp_int64)a[11]) * b[ 5]
       + ((sp_int64)a[12]) * b[ 4]
       + ((sp_int64)a[13]) * b[ 3]
       + ((sp_int64)a[14]) * b[ 2];
    /* From here on digits are written straight into r[15..29]. */
    r[15] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 3]) * b[14]
       + ((sp_int64)a[ 4]) * b[13]
       + ((sp_int64)a[ 5]) * b[12]
       + ((sp_int64)a[ 6]) * b[11]
       + ((sp_int64)a[ 7]) * b[10]
       + ((sp_int64)a[ 8]) * b[ 9]
       + ((sp_int64)a[ 9]) * b[ 8]
       + ((sp_int64)a[10]) * b[ 7]
       + ((sp_int64)a[11]) * b[ 6]
       + ((sp_int64)a[12]) * b[ 5]
       + ((sp_int64)a[13]) * b[ 4]
       + ((sp_int64)a[14]) * b[ 3];
    r[16] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 4]) * b[14]
       + ((sp_int64)a[ 5]) * b[13]
       + ((sp_int64)a[ 6]) * b[12]
       + ((sp_int64)a[ 7]) * b[11]
       + ((sp_int64)a[ 8]) * b[10]
       + ((sp_int64)a[ 9]) * b[ 9]
       + ((sp_int64)a[10]) * b[ 8]
       + ((sp_int64)a[11]) * b[ 7]
       + ((sp_int64)a[12]) * b[ 6]
       + ((sp_int64)a[13]) * b[ 5]
       + ((sp_int64)a[14]) * b[ 4];
    r[17] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 5]) * b[14]
       + ((sp_int64)a[ 6]) * b[13]
       + ((sp_int64)a[ 7]) * b[12]
       + ((sp_int64)a[ 8]) * b[11]
       + ((sp_int64)a[ 9]) * b[10]
       + ((sp_int64)a[10]) * b[ 9]
       + ((sp_int64)a[11]) * b[ 8]
       + ((sp_int64)a[12]) * b[ 7]
       + ((sp_int64)a[13]) * b[ 6]
       + ((sp_int64)a[14]) * b[ 5];
    r[18] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 6]) * b[14]
       + ((sp_int64)a[ 7]) * b[13]
       + ((sp_int64)a[ 8]) * b[12]
       + ((sp_int64)a[ 9]) * b[11]
       + ((sp_int64)a[10]) * b[10]
       + ((sp_int64)a[11]) * b[ 9]
       + ((sp_int64)a[12]) * b[ 8]
       + ((sp_int64)a[13]) * b[ 7]
       + ((sp_int64)a[14]) * b[ 6];
    r[19] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 7]) * b[14]
       + ((sp_int64)a[ 8]) * b[13]
       + ((sp_int64)a[ 9]) * b[12]
       + ((sp_int64)a[10]) * b[11]
       + ((sp_int64)a[11]) * b[10]
       + ((sp_int64)a[12]) * b[ 9]
       + ((sp_int64)a[13]) * b[ 8]
       + ((sp_int64)a[14]) * b[ 7];
    r[20] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[ 8]) * b[14]
       + ((sp_int64)a[ 9]) * b[13]
       + ((sp_int64)a[10]) * b[12]
       + ((sp_int64)a[11]) * b[11]
       + ((sp_int64)a[12]) * b[10]
       + ((sp_int64)a[13]) * b[ 9]
       + ((sp_int64)a[14]) * b[ 8];
    r[21] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[ 9]) * b[14]
       + ((sp_int64)a[10]) * b[13]
       + ((sp_int64)a[11]) * b[12]
       + ((sp_int64)a[12]) * b[11]
       + ((sp_int64)a[13]) * b[10]
       + ((sp_int64)a[14]) * b[ 9];
    r[22] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[10]) * b[14]
       + ((sp_int64)a[11]) * b[13]
       + ((sp_int64)a[12]) * b[12]
       + ((sp_int64)a[13]) * b[11]
       + ((sp_int64)a[14]) * b[10];
    r[23] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[11]) * b[14]
       + ((sp_int64)a[12]) * b[13]
       + ((sp_int64)a[13]) * b[12]
       + ((sp_int64)a[14]) * b[11];
    r[24] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[12]) * b[14]
       + ((sp_int64)a[13]) * b[13]
       + ((sp_int64)a[14]) * b[12];
    r[25] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = ((sp_int64)a[13]) * b[14]
       + ((sp_int64)a[14]) * b[13];
    r[26] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = ((sp_int64)a[14]) * b[14];
    r[27] = t1 & 0x3ffffff; t0 += t1 >> 26;
    r[28] = t0 & 0x3ffffff;
    /* Whatever is left above bit 26 of the last column is the top digit. */
    r[29] = (sp_digit)(t0 >> 26);
    /* All reads of a and b are done: publish the staged low digits. */
    XMEMCPY(r, t, sizeof(t));
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Square a and put result in r. (r = a * a)
 *
 * Compact squaring of a 15-digit number with 26-bit digits. Columns are
 * processed from the most significant down; in each column the diagonal
 * product (even columns only) is added once and the remaining products are
 * summed once and doubled. The accumulator keeps the pending digit shifted
 * up by 26 bits so that the next column's sum lands below it.
 *
 * r  A single precision integer. Receives 30 digits.
 * a  A single precision integer. 15 digits are read.
 */
SP_NOINLINE static void sp_384_sqr_15(sp_digit* r, const sp_digit* a)
{
    sp_uint64 acc;
    sp_uint64 cross;
    int col;
    int lo;
    int hi;

    /* Top column: high part is digit 29, low part is held back in acc. */
    acc = ((sp_uint64)a[14]) * a[14];
    r[29] = (sp_digit)(acc >> 26);
    acc = (acc & 0x3ffffff) << 26;

    for (col = 27; col >= 0; col--) {
        lo = (col + 1) / 2;
        hi = (col < 14) ? col : 14;

        /* Even column: a[col / 2] squared appears exactly once. */
        if ((col & 1) == 0) {
            acc += ((sp_uint64)a[lo]) * a[lo];
            lo++;
        }

        /* Off-diagonal products appear twice: sum once, then double. */
        cross = 0;
        while (lo <= hi) {
            cross += ((sp_uint64)a[lo]) * a[col - lo];
            lo++;
        }
        acc += cross * 2;

        r[col + 2] += (sp_digit) (acc >> 52);
        r[col + 1]  = (sp_digit)((acc >> 26) & 0x3ffffff);
        acc = (acc & 0x3ffffff) << 26;
    }
    r[0] = (sp_digit)(acc >> 26);
}

#else
/* Square a and put result in r. (r = a * a)
 *
 * Fully unrolled squaring of a 15-digit number with 26-bit digits. In each
 * column the products of two different digits are summed once and doubled,
 * and the product of a digit with itself (even columns) is added once.
 * Column sums alternate between t0 and t1: once the next column has been
 * summed, the low 26 bits of the previous one are stored as a result digit
 * and its upper bits are carried forward.
 *
 * r  A single precision integer. Receives 30 digits.
 * a  A single precision integer. 15 digits are read.
 */
SP_NOINLINE static void sp_384_sqr_15(sp_digit* r, const sp_digit* a)
{
    sp_int64 t0;
    sp_int64 t1;
    /* Digits 0..14 of the result are staged here and copied into r at the
     * end; presumably so that r may overlap a - confirm with callers. */
    sp_digit t[15];

    /* Columns 0..14. */
    t0 =  ((sp_int64)a[ 0]) * a[ 0];
    t1 = (((sp_int64)a[ 0]) * a[ 1]) * 2;
    t[ 0] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[ 2]) * 2
       +  ((sp_int64)a[ 1]) * a[ 1];
    t[ 1] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[ 3]
       +  ((sp_int64)a[ 1]) * a[ 2]) * 2;
    t[ 2] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[ 4]
       +  ((sp_int64)a[ 1]) * a[ 3]) * 2
       +  ((sp_int64)a[ 2]) * a[ 2];
    t[ 3] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[ 5]
       +  ((sp_int64)a[ 1]) * a[ 4]
       +  ((sp_int64)a[ 2]) * a[ 3]) * 2;
    t[ 4] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[ 6]
       +  ((sp_int64)a[ 1]) * a[ 5]
       +  ((sp_int64)a[ 2]) * a[ 4]) * 2
       +  ((sp_int64)a[ 3]) * a[ 3];
    t[ 5] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[ 7]
       +  ((sp_int64)a[ 1]) * a[ 6]
       +  ((sp_int64)a[ 2]) * a[ 5]
       +  ((sp_int64)a[ 3]) * a[ 4]) * 2;
    t[ 6] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[ 8]
       +  ((sp_int64)a[ 1]) * a[ 7]
       +  ((sp_int64)a[ 2]) * a[ 6]
       +  ((sp_int64)a[ 3]) * a[ 5]) * 2
       +  ((sp_int64)a[ 4]) * a[ 4];
    t[ 7] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[ 9]
       +  ((sp_int64)a[ 1]) * a[ 8]
       +  ((sp_int64)a[ 2]) * a[ 7]
       +  ((sp_int64)a[ 3]) * a[ 6]
       +  ((sp_int64)a[ 4]) * a[ 5]) * 2;
    t[ 8] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[10]
       +  ((sp_int64)a[ 1]) * a[ 9]
       +  ((sp_int64)a[ 2]) * a[ 8]
       +  ((sp_int64)a[ 3]) * a[ 7]
       +  ((sp_int64)a[ 4]) * a[ 6]) * 2
       +  ((sp_int64)a[ 5]) * a[ 5];
    t[ 9] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[11]
       +  ((sp_int64)a[ 1]) * a[10]
       +  ((sp_int64)a[ 2]) * a[ 9]
       +  ((sp_int64)a[ 3]) * a[ 8]
       +  ((sp_int64)a[ 4]) * a[ 7]
       +  ((sp_int64)a[ 5]) * a[ 6]) * 2;
    t[10] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[12]
       +  ((sp_int64)a[ 1]) * a[11]
       +  ((sp_int64)a[ 2]) * a[10]
       +  ((sp_int64)a[ 3]) * a[ 9]
       +  ((sp_int64)a[ 4]) * a[ 8]
       +  ((sp_int64)a[ 5]) * a[ 7]) * 2
       +  ((sp_int64)a[ 6]) * a[ 6];
    t[11] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 0]) * a[13]
       +  ((sp_int64)a[ 1]) * a[12]
       +  ((sp_int64)a[ 2]) * a[11]
       +  ((sp_int64)a[ 3]) * a[10]
       +  ((sp_int64)a[ 4]) * a[ 9]
       +  ((sp_int64)a[ 5]) * a[ 8]
       +  ((sp_int64)a[ 6]) * a[ 7]) * 2;
    t[12] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 0]) * a[14]
       +  ((sp_int64)a[ 1]) * a[13]
       +  ((sp_int64)a[ 2]) * a[12]
       +  ((sp_int64)a[ 3]) * a[11]
       +  ((sp_int64)a[ 4]) * a[10]
       +  ((sp_int64)a[ 5]) * a[ 9]
       +  ((sp_int64)a[ 6]) * a[ 8]) * 2
       +  ((sp_int64)a[ 7]) * a[ 7];
    t[13] = t1 & 0x3ffffff; t0 += t1 >> 26;
    /* Columns 15..28. */
    t1 = (((sp_int64)a[ 1]) * a[14]
       +  ((sp_int64)a[ 2]) * a[13]
       +  ((sp_int64)a[ 3]) * a[12]
       +  ((sp_int64)a[ 4]) * a[11]
       +  ((sp_int64)a[ 5]) * a[10]
       +  ((sp_int64)a[ 6]) * a[ 9]
       +  ((sp_int64)a[ 7]) * a[ 8]) * 2;
    t[14] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 2]) * a[14]
       +  ((sp_int64)a[ 3]) * a[13]
       +  ((sp_int64)a[ 4]) * a[12]
       +  ((sp_int64)a[ 5]) * a[11]
       +  ((sp_int64)a[ 6]) * a[10]
       +  ((sp_int64)a[ 7]) * a[ 9]) * 2
       +  ((sp_int64)a[ 8]) * a[ 8];
    /* From here on digits are written straight into r[15..29]. */
    r[15] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 3]) * a[14]
       +  ((sp_int64)a[ 4]) * a[13]
       +  ((sp_int64)a[ 5]) * a[12]
       +  ((sp_int64)a[ 6]) * a[11]
       +  ((sp_int64)a[ 7]) * a[10]
       +  ((sp_int64)a[ 8]) * a[ 9]) * 2;
    r[16] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 4]) * a[14]
       +  ((sp_int64)a[ 5]) * a[13]
       +  ((sp_int64)a[ 6]) * a[12]
       +  ((sp_int64)a[ 7]) * a[11]
       +  ((sp_int64)a[ 8]) * a[10]) * 2
       +  ((sp_int64)a[ 9]) * a[ 9];
    r[17] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 5]) * a[14]
       +  ((sp_int64)a[ 6]) * a[13]
       +  ((sp_int64)a[ 7]) * a[12]
       +  ((sp_int64)a[ 8]) * a[11]
       +  ((sp_int64)a[ 9]) * a[10]) * 2;
    r[18] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 6]) * a[14]
       +  ((sp_int64)a[ 7]) * a[13]
       +  ((sp_int64)a[ 8]) * a[12]
       +  ((sp_int64)a[ 9]) * a[11]) * 2
       +  ((sp_int64)a[10]) * a[10];
    r[19] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 7]) * a[14]
       +  ((sp_int64)a[ 8]) * a[13]
       +  ((sp_int64)a[ 9]) * a[12]
       +  ((sp_int64)a[10]) * a[11]) * 2;
    r[20] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[ 8]) * a[14]
       +  ((sp_int64)a[ 9]) * a[13]
       +  ((sp_int64)a[10]) * a[12]) * 2
       +  ((sp_int64)a[11]) * a[11];
    r[21] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[ 9]) * a[14]
       +  ((sp_int64)a[10]) * a[13]
       +  ((sp_int64)a[11]) * a[12]) * 2;
    r[22] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[10]) * a[14]
       +  ((sp_int64)a[11]) * a[13]) * 2
       +  ((sp_int64)a[12]) * a[12];
    r[23] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[11]) * a[14]
       +  ((sp_int64)a[12]) * a[13]) * 2;
    r[24] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 = (((sp_int64)a[12]) * a[14]) * 2
       +  ((sp_int64)a[13]) * a[13];
    r[25] = t1 & 0x3ffffff; t0 += t1 >> 26;
    t1 = (((sp_int64)a[13]) * a[14]) * 2;
    r[26] = t0 & 0x3ffffff; t1 += t0 >> 26;
    t0 =  ((sp_int64)a[14]) * a[14];
    r[27] = t1 & 0x3ffffff; t0 += t1 >> 26;
    r[28] = t0 & 0x3ffffff;
    /* Whatever is left above bit 26 of the last column is the top digit. */
    r[29] = (sp_digit)(t0 >> 26);
    /* All reads of a are done: publish the staged low digits. */
    XMEMCPY(r, t, sizeof(t));
}

#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Add b to a into r. (r = a + b)
 *
 * Digits are added pairwise; no carry is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_384_add_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 15) {
        r[idx] = a[idx] + b[idx];
        idx++;
    }

    return 0;
}
#else
/* Add b to a into r. (r = a + b)
 *
 * Digits are added pairwise, lowest index first; no carry is moved between
 * digits here. The 15 digits are handled as three groups of five.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_384_add_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base;

    for (base = 0; base < 15; base += 5) {
        r[base + 0] = a[base + 0] + b[base + 0];
        r[base + 1] = a[base + 1] + b[base + 1];
        r[base + 2] = a[base + 2] + b[base + 2];
        r[base + 3] = a[base + 3] + b[base + 3];
        r[base + 4] = a[base + 4] + b[base + 4];
    }

    return 0;
}


#endif /* WOLFSSL_SP_SMALL */
#ifdef WOLFSSL_SP_SMALL
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted pairwise; no borrow is moved between digits here.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_384_sub_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int idx = 0;

    while (idx < 15) {
        r[idx] = a[idx] - b[idx];
        idx++;
    }

    return 0;
}


#else
/* Sub b from a into r. (r = a - b)
 *
 * Digits are subtracted pairwise, lowest index first; no borrow is moved
 * between digits here. The 15 digits are handled as three groups of five.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 always.
 */
SP_NOINLINE static int sp_384_sub_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    int base;

    for (base = 0; base < 15; base += 5) {
        r[base + 0] = a[base + 0] - b[base + 0];
        r[base + 1] = a[base + 1] - b[base + 1];
        r[base + 2] = a[base + 2] - b[base + 2];
        r[base + 3] = a[base + 3] - b[base + 3];
        r[base + 4] = a[base + 4] - b[base + 4];
    }

    return 0;
}


#endif /* WOLFSSL_SP_SMALL */
/* Convert an mp_int to an array of sp_digit.
 *
 * The digits of a (DIGIT_BIT bits each) are repacked into 26-bit sp_digits,
 * least significant first. One of three code paths is compiled depending on
 * how DIGIT_BIT compares with 26. In the two repacking paths, entries of r
 * after the last converted digit are set to zero up to size.
 *
 * r  A single precision integer.
 * size  Maximum number of digits (entries of r) to write.
 * a  A multi-precision integer.
 */
static void sp_384_from_mp(sp_digit* r, int size, const mp_int* a)
{
#if DIGIT_BIT == 26
    /* Same digit width on both sides: digits are taken one for one. Rather
     * than branching on i < a->used, each read of a->dp[o] is combined with
     * a mask derived from the counter j, which starts at -used. */
    /* NOTE(review): whether (j >> 25) produces the intended all-ones/zero
     * mask and index step depends on the width and signedness of sp_digit,
     * which are not visible here - confirm before relying on this path. */
    int i;
    sp_digit j = (sp_digit)0 - (sp_digit)a->used;
    int o = 0;

    for (i = 0; i < size; i++) {
        sp_digit mask = (sp_digit)0 - (j >> 25);
        r[i] = a->dp[o] & mask;
        j++;
        o += (int)(j >> 25);
    }
#elif DIGIT_BIT > 26
    /* mp_int digits are wider than 26 bits: each one is spread over several
     * sp_digits. s is the bit offset at which the next piece is placed. */
    unsigned int i;
    int j = 0;
    word32 s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        /* Top up the current sp_digit with the low bits of this mp_digit. */
        r[j] |= ((sp_digit)a->dp[i] << s);
        r[j] &= 0x3ffffff;
        s = 26U - s;
        if (j + 1 >= size) {
            break;
        }
        /* lint allow cast of mismatch word32 and mp_digit */
        r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
        /* Peel off further whole 26-bit pieces while they fit. */
        while ((s + 26U) <= (word32)DIGIT_BIT) {
            s += 26U;
            r[j] &= 0x3ffffff;
            if (j + 1 >= size) {
                break;
            }
            if (s < (word32)DIGIT_BIT) {
                /* lint allow cast of mismatch word32 and mp_digit */
                r[++j] = (sp_digit)(a->dp[i] >> s); /*lint !e9033*/
            }
            else {
                r[++j] = (sp_digit)0;
            }
        }
        /* Number of bits already sitting in the partially filled r[j]. */
        s = (word32)DIGIT_BIT - s;
    }

    /* Zero the entries that received nothing. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#else
    /* mp_int digits are narrower than 26 bits: several of them are gathered
     * into each sp_digit. s is the number of bits already in r[j]. */
    unsigned int i;
    int j = 0;
    int s = 0;

    r[0] = 0;
    for (i = 0; i < (unsigned int)a->used && j < size; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 26) {
            /* r[j] is full: trim it and move the overflow bits on. */
            r[j] &= 0x3ffffff;
            if (j + 1 >= size) {
                break;
            }
            s = 26 - s;
            if (s == DIGIT_BIT) {
                r[++j] = 0;
                s = 0;
            }
            else {
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else {
            s += DIGIT_BIT;
        }
    }

    /* Zero the entries that received nothing. */
    for (j++; j < size; j++) {
        r[j] = 0;
    }
#endif
}

/* Convert a point of type ecc_point to type sp_point_384.
 *
 * Every ordinate array is cleared in full before its low 15 digits are
 * loaded, and the result is marked as not being the point at infinity.
 *
 * p   Point of type sp_point_384 (result).
 * pm  Point of type ecc_point.
 */
static void sp_384_point_from_ecc_point_15(sp_point_384* p,
        const ecc_point* pm)
{
    /* X ordinate */
    XMEMSET(p->x, 0, sizeof(p->x));
    sp_384_from_mp(p->x, 15, pm->x);

    /* Y ordinate */
    XMEMSET(p->y, 0, sizeof(p->y));
    sp_384_from_mp(p->y, 15, pm->y);

    /* Z ordinate */
    XMEMSET(p->z, 0, sizeof(p->z));
    sp_384_from_mp(p->z, 15, pm->z);

    p->infinity = 0;
}

/* Convert an array of sp_digit to an mp_int.
 *
 * The 15 digits of a (26 bits each) are repacked into digits of DIGIT_BIT
 * bits. r is first grown to hold 384 bits, and after the conversion its used
 * count is set and clamped.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the error from mp_grow when growing r fails and MP_OKAY otherwise.
 */
static int sp_384_to_mp(const sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (384 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) { /*lint !e774 case where err is always MP_OKAY*/
#if DIGIT_BIT == 26
        /* Same digit width on both sides: copy the digits as they are. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 15);
        r->used = 15;
        mp_clamp(r);
#elif DIGIT_BIT < 26
        /* mp_int digits are narrower: each sp_digit is spread over several
         * of them. s is the bit offset of the next piece. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 15; i++) {
            r->dp[j] |= (mp_digit)(a[i] << s);
            r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = (mp_digit)(a[i] >> s);
            while (s + DIGIT_BIT <= 26) {
                s += DIGIT_BIT;
                r->dp[j++] &= ((sp_digit)1 << DIGIT_BIT) - 1;
                if (s == SP_WORD_SIZE) {
                    r->dp[j] = 0;
                }
                else {
                    r->dp[j] = (mp_digit)(a[i] >> s);
                }
            }
            s = 26 - s;
        }
        r->used = (384 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp_int digits are wider: several sp_digits are gathered into each
         * of them. s is the number of bits already placed in r->dp[j]. */
        int i;
        int j = 0;
        int s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 15; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 26 >= DIGIT_BIT) {
    #if DIGIT_BIT != 32 && DIGIT_BIT != 64
                r->dp[j] &= ((sp_digit)1 << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                /* NOTE(review): for DIGIT_BIT of 32 or 64 the 15th digit
                 * crosses the last requested mp_digit, so this stores to
                 * r->dp[used]; presumably mp_grow leaves room for that -
                 * confirm. */
                r->dp[++j] = a[i] >> s;
                s = 26 - s;
            }
            else {
                s += 26;
            }
        }
        r->used = (384 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Store the ordinates of an sp_point_384 into an ecc_point.
 *
 * The x, y and z ordinates are converted in that order with sp_384_to_mp();
 * conversion stops at the first failure.
 *
 * p   Point of type sp_point_384.
 * pm  Point of type ecc_point (result).
 * returns MP_OKAY when all three conversions succeed, otherwise the first
 * error returned by sp_384_to_mp().
 */
static int sp_384_point_to_ecc_point_15(const sp_point_384* p, ecc_point* pm)
{
    int err = sp_384_to_mp(p->x, pm->x);

    if (err != MP_OKAY) {
        return err;
    }

    err = sp_384_to_mp(p->y, pm->y);
    if (err != MP_OKAY) {
        return err;
    }

    return sp_384_to_mp(p->z, pm->z);
}

/* Compare a with b in constant time.
 *
 * Words are visited from the most significant (index 14) down to index 0
 * with no data dependent branches. Only the sign, or zero-ness, of the
 * result is meaningful.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static sp_digit sp_384_cmp_15(const sp_digit* a, const sp_digit* b)
{
    sp_digit r = 0;
#ifdef WOLFSSL_SP_SMALL
    int i;

    /* While r is zero the mask is all ones and the word difference is taken
     * as is.
     * NOTE(review): once r is non-zero the mask appears intended to stop
     * lower words from changing the sign of r - confirm against the range
     * of the input words. */
    for (i=14; i>=0; i--) {
        r |= (a[i] - b[i]) & ~(((sp_digit)0 - r) >> 25);
    }
#else
    /* Unrolled form of the loop above; the first step uses an all ones mask
     * because r starts at zero. */
    r |= (a[14] - b[14]) & (0 - (sp_digit)1);
    r |= (a[13] - b[13]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[12] - b[12]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[11] - b[11]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[10] - b[10]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 9] - b[ 9]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 8] - b[ 8]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 7] - b[ 7]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 6] - b[ 6]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 5] - b[ 5]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 4] - b[ 4]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 3] - b[ 3]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 2] - b[ 2]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 1] - b[ 1]) & ~(((sp_digit)0 - r) >> 25);
    r |= (a[ 0] - b[ 0]) & ~(((sp_digit)0 - r) >> 25);
#endif /* WOLFSSL_SP_SMALL */

    return r;
}

/* Word-wise masked subtraction: r[i] = a[i] - (b[i] & m) for all 15 words.
 * With m of -1 this subtracts b from a; with m of 0 it copies a to r.
 * No borrow is propagated between words.
 *
 * r  A single precision number receiving the result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply to each word of b.
 */
static void sp_384_cond_sub_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int k = 0;

    while (k < 15) {
        r[k] = a[k] - (m & b[k]);
        k++;
    }
#else
    r[ 0] = a[ 0] - (m & b[ 0]);
    r[ 1] = a[ 1] - (m & b[ 1]);
    r[ 2] = a[ 2] - (m & b[ 2]);
    r[ 3] = a[ 3] - (m & b[ 3]);
    r[ 4] = a[ 4] - (m & b[ 4]);
    r[ 5] = a[ 5] - (m & b[ 5]);
    r[ 6] = a[ 6] - (m & b[ 6]);
    r[ 7] = a[ 7] - (m & b[ 7]);
    r[ 8] = a[ 8] - (m & b[ 8]);
    r[ 9] = a[ 9] - (m & b[ 9]);
    r[10] = a[10] - (m & b[10]);
    r[11] = a[11] - (m & b[11]);
    r[12] = a[12] - (m & b[12]);
    r[13] = a[13] - (m & b[13]);
    r[14] = a[14] - (m & b[14]);
#endif /* WOLFSSL_SP_SMALL */
}

/* Mul a by scalar b and add into r. (r += a * b)
 *
 * Products are formed in 64 bits and split at 26 bits. The final carry is
 * added into r[15], so r must have at least 16 words.
 * In the WOLFSSL_SP_SMALL build words 0..14 of r are left masked to 26 bits;
 * in the unrolled build the pieces are added into r without propagating
 * carries between words of r.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A scalar.
 */
SP_NOINLINE static void sp_384_mul_add_15(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
#ifdef WOLFSSL_SP_SMALL
    sp_int64 tb = b;
    sp_int64 t[4];
    int i;

    /* t[0] holds the carry from one group of four words into the next. */
    t[0] = 0;
    for (i = 0; i < 12; i += 4) {
        t[0] += (tb * a[i+0]) + r[i+0];
        t[1]  = (tb * a[i+1]) + r[i+1];
        t[2]  = (tb * a[i+2]) + r[i+2];
        t[3]  = (tb * a[i+3]) + r[i+3];
        r[i+0] = t[0] & 0x3ffffff;
        t[1] += t[0] >> 26;
        r[i+1] = t[1] & 0x3ffffff;
        t[2] += t[1] >> 26;
        r[i+2] = t[2] & 0x3ffffff;
        t[3] += t[2] >> 26;
        r[i+3] = t[3] & 0x3ffffff;
        t[0]  = t[3] >> 26;
    }
    /* Remaining three words (12..14) and the carry out into r[15]. */
    t[0] += (tb * a[12]) + r[12];
    t[1]  = (tb * a[13]) + r[13];
    t[2]  = (tb * a[14]) + r[14];
    r[12] = t[0] & 0x3ffffff;
    t[1] += t[0] >> 26;
    r[13] = t[1] & 0x3ffffff;
    t[2] += t[1] >> 26;
    r[14] = t[2] & 0x3ffffff;
    r[15] +=  (sp_digit)(t[2] >> 26);
#else
    sp_int64 tb = b;
    sp_int64 t[15];

    /* All 15 products first. */
    t[ 0] = tb * a[ 0];
    t[ 1] = tb * a[ 1];
    t[ 2] = tb * a[ 2];
    t[ 3] = tb * a[ 3];
    t[ 4] = tb * a[ 4];
    t[ 5] = tb * a[ 5];
    t[ 6] = tb * a[ 6];
    t[ 7] = tb * a[ 7];
    t[ 8] = tb * a[ 8];
    t[ 9] = tb * a[ 9];
    t[10] = tb * a[10];
    t[11] = tb * a[11];
    t[12] = tb * a[12];
    t[13] = tb * a[13];
    t[14] = tb * a[14];
    /* Word i of r receives the low 26 bits of product i plus the high part
     * of product i-1. */
    r[ 0] += (sp_digit)                 (t[ 0] & 0x3ffffff);
    r[ 1] += (sp_digit)((t[ 0] >> 26) + (t[ 1] & 0x3ffffff));
    r[ 2] += (sp_digit)((t[ 1] >> 26) + (t[ 2] & 0x3ffffff));
    r[ 3] += (sp_digit)((t[ 2] >> 26) + (t[ 3] & 0x3ffffff));
    r[ 4] += (sp_digit)((t[ 3] >> 26) + (t[ 4] & 0x3ffffff));
    r[ 5] += (sp_digit)((t[ 4] >> 26) + (t[ 5] & 0x3ffffff));
    r[ 6] += (sp_digit)((t[ 5] >> 26) + (t[ 6] & 0x3ffffff));
    r[ 7] += (sp_digit)((t[ 6] >> 26) + (t[ 7] & 0x3ffffff));
    r[ 8] += (sp_digit)((t[ 7] >> 26) + (t[ 8] & 0x3ffffff));
    r[ 9] += (sp_digit)((t[ 8] >> 26) + (t[ 9] & 0x3ffffff));
    r[10] += (sp_digit)((t[ 9] >> 26) + (t[10] & 0x3ffffff));
    r[11] += (sp_digit)((t[10] >> 26) + (t[11] & 0x3ffffff));
    r[12] += (sp_digit)((t[11] >> 26) + (t[12] & 0x3ffffff));
    r[13] += (sp_digit)((t[12] >> 26) + (t[13] & 0x3ffffff));
    r[14] += (sp_digit)((t[13] >> 26) + (t[14] & 0x3ffffff));
    r[15] += (sp_digit) (t[14] >> 26);
#endif /* WOLFSSL_SP_SMALL */
}

/* Propagate carries so that words 0..13 each hold 26 bits.
 *
 * Whatever lies above bit 25 of a word is added into the next word up. The
 * top word (index 14) only receives a carry and is not masked.
 *
 * a  Array of sp_digit to normalize.
 */
static void sp_384_norm_15(sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int k;

    for (k = 1; k < 15; k++) {
        a[k] += a[k - 1] >> 26;
        a[k - 1] &= 0x3ffffff;
    }
#else
    a[ 1] += a[ 0] >> 26;
    a[ 0] &= 0x3ffffff;
    a[ 2] += a[ 1] >> 26;
    a[ 1] &= 0x3ffffff;
    a[ 3] += a[ 2] >> 26;
    a[ 2] &= 0x3ffffff;
    a[ 4] += a[ 3] >> 26;
    a[ 3] &= 0x3ffffff;
    a[ 5] += a[ 4] >> 26;
    a[ 4] &= 0x3ffffff;
    a[ 6] += a[ 5] >> 26;
    a[ 5] &= 0x3ffffff;
    a[ 7] += a[ 6] >> 26;
    a[ 6] &= 0x3ffffff;
    a[ 8] += a[ 7] >> 26;
    a[ 7] &= 0x3ffffff;
    a[ 9] += a[ 8] >> 26;
    a[ 8] &= 0x3ffffff;
    a[10] += a[ 9] >> 26;
    a[ 9] &= 0x3ffffff;
    a[11] += a[10] >> 26;
    a[10] &= 0x3ffffff;
    a[12] += a[11] >> 26;
    a[11] &= 0x3ffffff;
    a[13] += a[12] >> 26;
    a[12] &= 0x3ffffff;
    a[14] += a[13] >> 26;
    a[13] &= 0x3ffffff;
#endif /* WOLFSSL_SP_SMALL */
}

/* Shift the result in the high 384 bits down to the bottom.
 *
 * Reads a[14]..a[29]: the value starts at bit 20 of a[14] (14 * 26 + 20 =
 * 384). Writes r[0]..r[14] as 26-bit words, with r[14] taking whatever
 * remains, and then zeros r[15]..r[29]; r must therefore have 30 words.
 * Each input word is read before the output word at that index is written.
 *
 * r  A single precision number.
 * a  A single precision number.
 */
static void sp_384_mont_shift_15(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int i;
    /* n is a sliding window: top 6 bits of a[14] then a[15] above them. */
    sp_int64 n = a[14] >> 20;
    n += ((sp_int64)a[15]) << 6;

    for (i = 0; i < 14; i++) {
        r[i] = n & 0x3ffffff;
        n >>= 26;
        n += ((sp_int64)a[16 + i]) << 6;
    }
    r[14] = (sp_digit)n;
#else
    /* Unrolled form of the sliding window described above. */
    sp_int64 n = a[14] >> 20;
    n += ((sp_int64)a[15]) << 6;
    r[ 0] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[16]) << 6;
    r[ 1] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[17]) << 6;
    r[ 2] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[18]) << 6;
    r[ 3] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[19]) << 6;
    r[ 4] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[20]) << 6;
    r[ 5] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[21]) << 6;
    r[ 6] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[22]) << 6;
    r[ 7] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[23]) << 6;
    r[ 8] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[24]) << 6;
    r[ 9] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[25]) << 6;
    r[10] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[26]) << 6;
    r[11] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[27]) << 6;
    r[12] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[28]) << 6;
    r[13] = n & 0x3ffffff; n >>= 26; n += ((sp_int64)a[29]) << 6;
    r[14] = (sp_digit)n;
#endif /* WOLFSSL_SP_SMALL */
    /* Clear the upper half of the result buffer. */
    XMEMSET(&r[15], 0, sizeof(*r) * 15U);
}

/* Reduce the number back to 384 bits using Montgomery reduction.
 *
 * Generic version that uses the supplied modulus m and multiplier mp.
 * a is treated as a 30 word buffer: the low 15 words are eliminated one at
 * a time and the high part is then shifted down into a[0..14].
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
static void sp_384_mont_reduce_order_15(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit mu;
    sp_digit over;

    /* Normalize the upper half before carries are added into it. */
    sp_384_norm_15(a + 15);

    for (i=0; i<14; i++) {
        /* mu is the multiple of m to add for this word (26 bits). */
        mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0x3ffffff;
        sp_384_mul_add_15(a+i, m, mu);
        a[i+1] += a[i] >> 26;
    }
    /* Last word: only 20 bits take part (14 * 26 + 20 = 384). */
    mu = ((sp_uint32)a[i] * (sp_uint32)mp) & 0xfffffL;
    sp_384_mul_add_15(a+i, m, mu);
    a[i+1] += a[i] >> 26;
    a[i] &= 0x3ffffff;
    /* Move the high 384 bits down to a[0..14]. */
    sp_384_mont_shift_15(a, a);
    /* Subtract m when anything remains at or above bit 384. */
    over = a[14] >> 20;
    sp_384_cond_sub_15(a, a, m, ~((over - 1) >> 31));
    sp_384_norm_15(a);
}

/* Reduce the number back to 384 bits using Montgomery reduction.
 *
 * Version with the modulus built into the code: m and mp are accepted for
 * signature compatibility but are not used. a is treated as a 30 word
 * buffer (indices up to a[29] are accessed); the result is left in
 * a[0..14].
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus (unused).
 * mp  The digit representing the negative inverse of m mod 2^n (unused).
 */
static void sp_384_mont_reduce_15(sp_digit* a, const sp_digit* m, sp_digit mp)
{
    int i;
    sp_digit am;

    (void)m;
    (void)mp;

    /* For each low word, add/subtract shifted copies of its low 26 bits
     * into higher words, then carry the word into the next one.
     * NOTE(review): the shift pattern presumably encodes am * p384_mod -
     * confirm against the modulus constants. */
    for (i = 0; i < 14; i++) {
        am = (a[i] * 0x1) & 0x3ffffff;
        a[i +  1] += (am << 6) & 0x3ffffff;
        a[i +  2] += am >> 20;
        a[i +  3] -= (am << 18) & 0x3ffffff;
        a[i +  4] -= am >> 8;
        a[i +  4] -= (am << 24) & 0x3ffffff;
        a[i +  5] -= am >> 2;
        a[i + 14] += (am << 20) & 0x3ffffff;
        a[i + 15] += am >> 6;

        a[i +  1] += a[i] >> 26;
    }
    /* Word 14: only its low 20 bits take part (14 * 26 + 20 = 384). */
    am = (a[14] * 0x1) & 0xfffff;
    a[14 +  1] += (am << 6) & 0x3ffffff;
    a[14 +  2] += am >> 20;
    a[14 +  3] -= (am << 18) & 0x3ffffff;
    a[14 +  4] -= am >> 8;
    a[14 +  4] -= (am << 24) & 0x3ffffff;
    a[14 +  5] -= am >> 2;
    a[14 + 14] += (am << 20) & 0x3ffffff;
    a[14 + 15] += am >> 6;

    /* Shift down by 384 bits: output word k is built from a[14 + k] >> 20
     * and the low bits of a[15 + k] << 6. */
    a[0] = (a[14] >> 20) + ((a[15] << 6) & 0x3ffffff);
    a[1] = (a[15] >> 20) + ((a[16] << 6) & 0x3ffffff);
    a[2] = (a[16] >> 20) + ((a[17] << 6) & 0x3ffffff);
    a[3] = (a[17] >> 20) + ((a[18] << 6) & 0x3ffffff);
    a[4] = (a[18] >> 20) + ((a[19] << 6) & 0x3ffffff);
    a[5] = (a[19] >> 20) + ((a[20] << 6) & 0x3ffffff);
    a[6] = (a[20] >> 20) + ((a[21] << 6) & 0x3ffffff);
    a[7] = (a[21] >> 20) + ((a[22] << 6) & 0x3ffffff);
    a[8] = (a[22] >> 20) + ((a[23] << 6) & 0x3ffffff);
    a[9] = (a[23] >> 20) + ((a[24] << 6) & 0x3ffffff);
    a[10] = (a[24] >> 20) + ((a[25] << 6) & 0x3ffffff);
    a[11] = (a[25] >> 20) + ((a[26] << 6) & 0x3ffffff);
    a[12] = (a[26] >> 20) + ((a[27] << 6) & 0x3ffffff);
    a[13] = (a[27] >> 20) + ((a[28] << 6) & 0x3ffffff);
    /* Top word is not masked so that any overflow stays visible. */
    a[14] = (a[14 + 14] >> 20) +  (a[29] << 6);

    /* Normalize words 0..13 to 26 bits. */
    a[1] += a[0] >> 26; a[0] &= 0x3ffffff;
    a[2] += a[1] >> 26; a[1] &= 0x3ffffff;
    a[3] += a[2] >> 26; a[2] &= 0x3ffffff;
    a[4] += a[3] >> 26; a[3] &= 0x3ffffff;
    a[5] += a[4] >> 26; a[4] &= 0x3ffffff;
    a[6] += a[5] >> 26; a[5] &= 0x3ffffff;
    a[7] += a[6] >> 26; a[6] &= 0x3ffffff;
    a[8] += a[7] >> 26; a[7] &= 0x3ffffff;
    a[9] += a[8] >> 26; a[8] &= 0x3ffffff;
    a[10] += a[9] >> 26; a[9] &= 0x3ffffff;
    a[11] += a[10] >> 26; a[10] &= 0x3ffffff;
    a[12] += a[11] >> 26; a[11] &= 0x3ffffff;
    a[13] += a[12] >> 26; a[12] &= 0x3ffffff;
    a[14] += a[13] >> 26; a[13] &= 0x3ffffff;

    /* Get the bit over, if any. */
    am = a[14] >> 20;
    /* Create mask. */
    am = 0 - am;

    /* Masked subtraction of the modulus words (constants inlined). */
    a[0] -= 0x03ffffff & am;
    a[1] -= 0x0000003f & am;
    /* p384_mod[2] is zero */
    a[3] -= 0x03fc0000 & am;
    a[4] -= 0x02ffffff & am;
    a[5] -= 0x03ffffff & am;
    a[6] -= 0x03ffffff & am;
    a[7] -= 0x03ffffff & am;
    a[8] -= 0x03ffffff & am;
    a[9] -= 0x03ffffff & am;
    a[10] -= 0x03ffffff & am;
    a[11] -= 0x03ffffff & am;
    a[12] -= 0x03ffffff & am;
    a[13] -= 0x03ffffff & am;
    a[14] -= 0x000fffff & am;

    /* Normalize again after the subtraction. */
    a[1] += a[0] >> 26; a[0] &= 0x3ffffff;
    a[2] += a[1] >> 26; a[1] &= 0x3ffffff;
    a[3] += a[2] >> 26; a[2] &= 0x3ffffff;
    a[4] += a[3] >> 26; a[3] &= 0x3ffffff;
    a[5] += a[4] >> 26; a[4] &= 0x3ffffff;
    a[6] += a[5] >> 26; a[5] &= 0x3ffffff;
    a[7] += a[6] >> 26; a[6] &= 0x3ffffff;
    a[8] += a[7] >> 26; a[7] &= 0x3ffffff;
    a[9] += a[8] >> 26; a[8] &= 0x3ffffff;
    a[10] += a[9] >> 26; a[9] &= 0x3ffffff;
    a[11] += a[10] >> 26; a[10] &= 0x3ffffff;
    a[12] += a[11] >> 26; a[11] &= 0x3ffffff;
    a[13] += a[12] >> 26; a[12] &= 0x3ffffff;
    a[14] += a[13] >> 26; a[13] &= 0x3ffffff;
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_384_mul_15() and then reduced in place
 * by sp_384_mont_reduce_15(), which accesses r up to index 29; r must
 * therefore have room for 30 words.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime). Passed on to sp_384_mont_reduce_15().
 * mp  Montgomery multiplier. Passed on to sp_384_mont_reduce_15().
 */
SP_NOINLINE static void sp_384_mont_mul_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit* m, sp_digit mp)
{
    sp_384_mul_15(r, a, b);
    sp_384_mont_reduce_15(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_384_sqr_15() and then reduced in place
 * by sp_384_mont_reduce_15(), which accesses r up to index 29; r must
 * therefore have room for 30 words.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime). Passed on to sp_384_mont_reduce_15().
 * mp  Montgomery multiplier. Passed on to sp_384_mont_reduce_15().
 */
SP_NOINLINE static void sp_384_mont_sqr_15(sp_digit* r, const sp_digit* a,
        const sp_digit* m, sp_digit mp)
{
    sp_384_sqr_15(r, a);
    sp_384_mont_reduce_15(r, m, mp);
}

#if !defined(WOLFSSL_SP_SMALL) || defined(HAVE_COMP_KEY)
/* Repeatedly square a Montgomery form number. (r = a ^ (2 ^ n) mod m)
 *
 * At least one squaring is always performed, even when n is less than 1.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * n   Number of times to square.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
SP_NOINLINE static void sp_384_mont_sqr_n_15(sp_digit* r,
    const sp_digit* a, int n, const sp_digit* m, sp_digit mp)
{
    int done;

    /* First squaring reads a; all later ones square r in place. */
    sp_384_mont_sqr_15(r, a, m, mp);
    for (done = 1; done < n; done++) {
        sp_384_mont_sqr_15(r, r, m, mp);
    }
}

#endif /* !WOLFSSL_SP_SMALL || HAVE_COMP_KEY */
#ifdef WOLFSSL_SP_SMALL
/* The P384 prime modulus minus 2, as twelve 32-bit words with the least
 * significant word first. Its bits are used as the exponent by the small
 * build of sp_384_mont_inv_15(). */
static const uint32_t p384_mod_minus_2[12] = {
    0xfffffffdU,0x00000000U,0x00000000U,0xffffffffU,0xfffffffeU,0xffffffffU,
    0xffffffffU,0xffffffffU,0xffffffffU,0xffffffffU,0xffffffffU,0xffffffffU
};
#endif /* WOLFSSL_SP_SMALL */

/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
 * P384 curve. (r = 1 / a mod m)
 *
 * The inverse is computed as a raised to the power (modulus - 2).
 * The small build walks the bits of p384_mod_minus_2 with square and
 * multiply; otherwise a fixed addition chain is used.
 *
 * r   Inverse result.
 * a   Number to invert.
 * td  Temporary data. The small build uses it as one temporary that is
 *     reduced in place; the other build uses five temporaries at offsets of
 *     2 * 15 words.
 */
static void sp_384_mont_inv_15(sp_digit* r, const sp_digit* a, sp_digit* td)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* t = td;
    int i;

    /* Top exponent bit (383) is accounted for by starting with t = a. */
    XMEMCPY(t, a, sizeof(sp_digit) * 15);
    for (i=382; i>=0; i--) {
        sp_384_mont_sqr_15(t, t, p384_mod, p384_mp_mod);
        /* Build the bit mask in the table's unsigned 32-bit type: shifting
         * a signed 1 into bit 31 is undefined behaviour. */
        if ((p384_mod_minus_2[i / 32] & ((uint32_t)1 << (i % 32))) != 0) {
            sp_384_mont_mul_15(t, t, a, p384_mod, p384_mp_mod);
        }
    }
    XMEMCPY(r, t, sizeof(sp_digit) * 15);
#else
    sp_digit* t1 = td;
    sp_digit* t2 = td + 2 * 15;
    sp_digit* t3 = td + 4 * 15;
    sp_digit* t4 = td + 6 * 15;
    sp_digit* t5 = td + 8 * 15;

    /* The hex value in each comment is the exponent of a held by the
     * destination after that step. */

    /* 0x2 */
    sp_384_mont_sqr_15(t1, a, p384_mod, p384_mp_mod);
    /* 0x3 */
    sp_384_mont_mul_15(t5, t1, a, p384_mod, p384_mp_mod);
    /* 0xc */
    sp_384_mont_sqr_n_15(t1, t5, 2, p384_mod, p384_mp_mod);
    /* 0xf */
    sp_384_mont_mul_15(t2, t5, t1, p384_mod, p384_mp_mod);
    /* 0x1e */
    sp_384_mont_sqr_15(t1, t2, p384_mod, p384_mp_mod);
    /* 0x1f */
    sp_384_mont_mul_15(t4, t1, a, p384_mod, p384_mp_mod);
    /* 0x3e0 */
    sp_384_mont_sqr_n_15(t1, t4, 5, p384_mod, p384_mp_mod);
    /* 0x3ff */
    sp_384_mont_mul_15(t2, t4, t1, p384_mod, p384_mp_mod);
    /* 0x7fe0 */
    sp_384_mont_sqr_n_15(t1, t2, 5, p384_mod, p384_mp_mod);
    /* 0x7fff */
    sp_384_mont_mul_15(t4, t4, t1, p384_mod, p384_mp_mod);
    /* 0x3fff8000 */
    sp_384_mont_sqr_n_15(t1, t4, 15, p384_mod, p384_mp_mod);
    /* 0x3fffffff */
    sp_384_mont_mul_15(t2, t4, t1, p384_mod, p384_mp_mod);
    /* 0xfffffffc */
    sp_384_mont_sqr_n_15(t3, t2, 2, p384_mod, p384_mp_mod);
    /* 0xfffffffd */
    sp_384_mont_mul_15(r, t3, a, p384_mod, p384_mp_mod);
    /* 0xffffffff */
    sp_384_mont_mul_15(t3, t5, t3, p384_mod, p384_mp_mod);
    /* 0xfffffffc0000000 */
    sp_384_mont_sqr_n_15(t1, t2, 30, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffff */
    sp_384_mont_mul_15(t2, t2, t1, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffff000000000000000 */
    sp_384_mont_sqr_n_15(t1, t2, 60, p384_mod, p384_mp_mod);
    /* 0xffffffffffffffffffffffffffffff */
    sp_384_mont_mul_15(t2, t2, t1, p384_mod, p384_mp_mod);
    /* 0xffffffffffffffffffffffffffffff000000000000000000000000000000 */
    sp_384_mont_sqr_n_15(t1, t2, 120, p384_mod, p384_mp_mod);
    /* 0xffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff */
    sp_384_mont_mul_15(t2, t2, t1, p384_mod, p384_mp_mod);
    /* 0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff8000 */
    sp_384_mont_sqr_n_15(t1, t2, 15, p384_mod, p384_mp_mod);
    /* 0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff */
    sp_384_mont_mul_15(t2, t4, t1, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffe00000000 */
    sp_384_mont_sqr_n_15(t1, t2, 33, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff */
    sp_384_mont_mul_15(t2, t3, t1, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff000000000000000000000000 */
    sp_384_mont_sqr_n_15(t1, t2, 96, p384_mod, p384_mp_mod);
    /* 0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffeffffffff0000000000000000fffffffd */
    sp_384_mont_mul_15(r, r, t1, p384_mod, p384_mp_mod);

#endif /* WOLFSSL_SP_SMALL */
}

/* Convert a Montgomery form projective point to an affine point.
 *
 * x is multiplied by 1/z^2 and y by 1/z^3; each is then taken through one
 * more Montgomery reduction and a final conditional subtraction of the
 * modulus. z of the result is set to 1.
 *
 * r  Resulting affine coordinate point.
 * p  Montgomery form projective coordinate point.
 * t  Temporary ordinate data.
 */
static void sp_384_map_15(sp_point_384* r, const sp_point_384* p,
    sp_digit* t)
{
    sp_digit* u = t;            /* 1/z, later 1/z^3 */
    sp_digit* v = t + 2*15;     /* scratch for the inversion, then 1/z^2 */
    sp_int32 cmp;

    /* u = 1/z */
    sp_384_mont_inv_15(u, p->z, v);

    /* v = 1/z^2, u = 1/z^3 */
    sp_384_mont_sqr_15(v, u, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(u, v, u, p384_mod, p384_mp_mod);

    /* Affine x: multiply by 1/z^2, clear the upper half and reduce once
     * more, then bring the value below the modulus. */
    sp_384_mont_mul_15(r->x, p->x, v, p384_mod, p384_mp_mod);
    XMEMSET(r->x + 15, 0, sizeof(sp_digit) * 15U);
    sp_384_mont_reduce_15(r->x, p384_mod, p384_mp_mod);
    cmp = sp_384_cmp_15(r->x, p384_mod);
    sp_384_cond_sub_15(r->x, r->x, p384_mod, ~(cmp >> 25));
    sp_384_norm_15(r->x);

    /* Affine y: same steps using 1/z^3. */
    sp_384_mont_mul_15(r->y, p->y, u, p384_mod, p384_mp_mod);
    XMEMSET(r->y + 15, 0, sizeof(sp_digit) * 15U);
    sp_384_mont_reduce_15(r->y, p384_mod, p384_mp_mod);
    cmp = sp_384_cmp_15(r->y, p384_mod);
    sp_384_cond_sub_15(r->y, r->y, p384_mod, ~(cmp >> 25));
    sp_384_norm_15(r->y);

    /* z = 1: only the first half of the z buffer is cleared. */
    XMEMSET(r->z, 0, sizeof(r->z) / 2);
    r->z[0] = 1;
}

/* Modular addition of two Montgomery form numbers (r = a + b % m).
 *
 * The modulus is subtracted once when the sum has any bits at or above
 * bit 384.
 *
 * r   Result of addition.
 * a   First number to add in Montgomery form.
 * b   Second number to add in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_384_mont_add_15(sp_digit* r, const sp_digit* a, const sp_digit* b,
        const sp_digit* m)
{
    sp_digit mask;

    (void)sp_384_add_15(r, a, b);
    sp_384_norm_15(r);

    /* r[14] >> 20 is whatever lies at or above bit 384 (14 * 26 + 20). */
    mask = ~(((r[14] >> 20) - 1) >> 31);
    sp_384_cond_sub_15(r, r, m, mask);
    sp_384_norm_15(r);
}

/* Modular doubling of a Montgomery form number (r = a + a % m).
 *
 * The modulus is subtracted once when the sum has any bits at or above
 * bit 384.
 *
 * r   Result of doubling.
 * a   Number to double in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_384_mont_dbl_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    sp_digit mask;

    (void)sp_384_add_15(r, a, a);
    sp_384_norm_15(r);

    /* r[14] >> 20 is whatever lies at or above bit 384 (14 * 26 + 20). */
    mask = ~(((r[14] >> 20) - 1) >> 31);
    sp_384_cond_sub_15(r, r, m, mask);
    sp_384_norm_15(r);
}

/* Modular tripling of a Montgomery form number (r = a + a + a % m).
 *
 * Done as a doubling followed by one more addition of a, with a conditional
 * subtraction of the modulus after each step.
 *
 * r   Result of Tripling.
 * a   Number to triple in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_384_mont_tpl_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
    sp_digit mask;

    /* r = a + a */
    (void)sp_384_add_15(r, a, a);
    sp_384_norm_15(r);
    mask = ~(((r[14] >> 20) - 1) >> 31);
    sp_384_cond_sub_15(r, r, m, mask);
    sp_384_norm_15(r);

    /* r = r + a */
    (void)sp_384_add_15(r, r, a);
    sp_384_norm_15(r);
    mask = ~(((r[14] >> 20) - 1) >> 31);
    sp_384_cond_sub_15(r, r, m, mask);
    sp_384_norm_15(r);
}

#ifdef WOLFSSL_SP_SMALL
/* Word-wise masked addition: r[i] = a[i] + (b[i] & m) for all 15 words.
 * With m of -1 this adds b to a; with m of 0 it copies a to r.
 * No carry is propagated between words.
 *
 * r  A single precision number receiving the result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply to each word of b.
 */
static void sp_384_cond_add_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    int k = 0;

    while (k < 15) {
        r[k] = (m & b[k]) + a[k];
        k++;
    }
}
#endif /* WOLFSSL_SP_SMALL */

#ifndef WOLFSSL_SP_SMALL
/* Word-wise masked addition, unrolled: r[i] = a[i] + (b[i] & m) for all 15
 * words. With m of -1 this adds b to a; with m of 0 it copies a to r.
 * No carry is propagated between words.
 *
 * r  A single precision number receiving the result.
 * a  A single precision number to add with.
 * b  A single precision number to add.
 * m  Mask value to apply to each word of b.
 */
static void sp_384_cond_add_15(sp_digit* r, const sp_digit* a,
        const sp_digit* b, const sp_digit m)
{
    r[ 0] = (m & b[ 0]) + a[ 0];
    r[ 1] = (m & b[ 1]) + a[ 1];
    r[ 2] = (m & b[ 2]) + a[ 2];
    r[ 3] = (m & b[ 3]) + a[ 3];
    r[ 4] = (m & b[ 4]) + a[ 4];
    r[ 5] = (m & b[ 5]) + a[ 5];
    r[ 6] = (m & b[ 6]) + a[ 6];
    r[ 7] = (m & b[ 7]) + a[ 7];
    r[ 8] = (m & b[ 8]) + a[ 8];
    r[ 9] = (m & b[ 9]) + a[ 9];
    r[10] = (m & b[10]) + a[10];
    r[11] = (m & b[11]) + a[11];
    r[12] = (m & b[12]) + a[12];
    r[13] = (m & b[13]) + a[13];
    r[14] = (m & b[14]) + a[14];
}
#endif /* !WOLFSSL_SP_SMALL */

/* Modular subtraction of two Montgomery form numbers (r = a - b % m).
 *
 * After the raw subtraction and normalization, r[14] >> 20 is used as the
 * mask for adding the modulus back in.
 *
 * r   Result of subtraction.
 * a   Number to subtract from in Montgomery form.
 * b   Number to subtract with in Montgomery form.
 * m   Modulus (prime).
 */
static void sp_384_mont_sub_15(sp_digit* r, const sp_digit* a, const sp_digit* b,
        const sp_digit* m)
{
    sp_digit mask;

    (void)sp_384_sub_15(r, a, b);
    sp_384_norm_15(r);

    mask = r[14] >> 20;
    sp_384_cond_add_15(r, r, m, mask);
    sp_384_norm_15(r);
}

/* Shift number right one bit.
 * Bottom bit is lost.
 *
 * Each result word takes the upper 25 bits of the matching input word and
 * the bottom bit of the next word up as its bit 25.
 *
 * r  Result of shift.
 * a  Number to shift.
 */
SP_NOINLINE static void sp_384_rshift1_15(sp_digit* r, const sp_digit* a)
{
#ifdef WOLFSSL_SP_SMALL
    int k = 0;

    while (k < 14) {
        r[k] = ((a[k + 1] << 25) & 0x3ffffff) + (a[k] >> 1);
        k++;
    }
#else
    r[ 0] = ((a[ 1] << 25) & 0x3ffffff) + (a[ 0] >> 1);
    r[ 1] = ((a[ 2] << 25) & 0x3ffffff) + (a[ 1] >> 1);
    r[ 2] = ((a[ 3] << 25) & 0x3ffffff) + (a[ 2] >> 1);
    r[ 3] = ((a[ 4] << 25) & 0x3ffffff) + (a[ 3] >> 1);
    r[ 4] = ((a[ 5] << 25) & 0x3ffffff) + (a[ 4] >> 1);
    r[ 5] = ((a[ 6] << 25) & 0x3ffffff) + (a[ 5] >> 1);
    r[ 6] = ((a[ 7] << 25) & 0x3ffffff) + (a[ 6] >> 1);
    r[ 7] = ((a[ 8] << 25) & 0x3ffffff) + (a[ 7] >> 1);
    r[ 8] = ((a[ 9] << 25) & 0x3ffffff) + (a[ 8] >> 1);
    r[ 9] = ((a[10] << 25) & 0x3ffffff) + (a[ 9] >> 1);
    r[10] = ((a[11] << 25) & 0x3ffffff) + (a[10] >> 1);
    r[11] = ((a[12] << 25) & 0x3ffffff) + (a[11] >> 1);
    r[12] = ((a[13] << 25) & 0x3ffffff) + (a[12] >> 1);
    r[13] = ((a[14] << 25) & 0x3ffffff) + (a[13] >> 1);
#endif
    /* Top word has nothing above it to shift in. */
    r[14] = a[14] >> 1;
}

/* Halve a number mod the modulus (prime). (r = a / 2 % m)
 *
 * When a is odd the modulus is added first; the value is then shifted
 * right by one bit.
 *
 * r  Result of division by 2.
 * a  Number to divide.
 * m  Modulus (prime).
 */
static void sp_384_mont_div2_15(sp_digit* r, const sp_digit* a,
        const sp_digit* m)
{
    /* All ones when the bottom bit of a is set, zero otherwise. */
    sp_digit odd = 0 - (a[0] & 1);

    sp_384_cond_add_15(r, a, m, odd);
    sp_384_norm_15(r);
    sp_384_rshift1_15(r, r);
}

/* Double the Montgomery form projective point p.
 *
 * r may be the same point as p: every ordinate of p is read before the
 * matching ordinate of r is overwritten.
 *
 * r  Result of doubling point.
 * p  Point to double.
 * t  Temporary ordinate data. Two temporaries are used, at t and t + 2*15.
 */
static void sp_384_proj_point_dbl_15(sp_point_384* r, const sp_point_384* p,
    sp_digit* t)
{
    sp_digit* ta = t;
    sp_digit* tb = t + 2*15;

    /* Carry the infinity flag across when the result is a separate point. */
    if (r != p) {
        r->infinity = p->infinity;
    }

    /* ta = Z^2 */
    sp_384_mont_sqr_15(ta, p->z, p384_mod, p384_mp_mod);
    /* Z3 = 2 * Y * Z */
    sp_384_mont_mul_15(r->z, p->y, p->z, p384_mod, p384_mp_mod);
    sp_384_mont_dbl_15(r->z, r->z, p384_mod);
    /* tb = X - Z^2 */
    sp_384_mont_sub_15(tb, p->x, ta, p384_mod);
    /* ta = X + Z^2 */
    sp_384_mont_add_15(ta, p->x, ta, p384_mod);
    /* tb = ta * tb */
    sp_384_mont_mul_15(tb, ta, tb, p384_mod, p384_mp_mod);
    /* ta = 3 * tb */
    sp_384_mont_tpl_15(ta, tb, p384_mod);
    /* Y3 = (2 * Y)^2 */
    sp_384_mont_dbl_15(r->y, p->y, p384_mod);
    sp_384_mont_sqr_15(r->y, r->y, p384_mod, p384_mp_mod);
    /* tb = Y3^2 / 2 */
    sp_384_mont_sqr_15(tb, r->y, p384_mod, p384_mp_mod);
    sp_384_mont_div2_15(tb, tb, p384_mod);
    /* Y3 = Y3 * X */
    sp_384_mont_mul_15(r->y, r->y, p->x, p384_mod, p384_mp_mod);
    /* X3 = ta^2 - 2 * Y3 */
    sp_384_mont_sqr_15(r->x, ta, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(r->x, r->x, r->y, p384_mod);
    sp_384_mont_sub_15(r->x, r->x, r->y, p384_mod);
    /* Y3 = (Y3 - X3) * ta - tb */
    sp_384_mont_sub_15(r->y, r->y, r->x, p384_mod);
    sp_384_mont_mul_15(r->y, r->y, ta, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(r->y, r->y, tb, p384_mod);
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of sp_384_proj_point_dbl_15_nb(). It is stored in
 * the data area of the caller's sp_ecc_ctx_t. */
typedef struct sp_384_proj_point_dbl_15_ctx {
    int state;      /* Next step to run; 19 means the doubling is complete. */
    sp_digit* t1;   /* First temporary (t). */
    sp_digit* t2;   /* Second temporary (t + 2*15). */
    sp_digit* x;    /* Result x ordinate. */
    sp_digit* y;    /* Result y ordinate. */
    sp_digit* z;    /* Result z ordinate. */
} sp_384_proj_point_dbl_15_ctx;

/* Double the Montgomery form projective point p - non-blocking version.
 *
 * Each call runs one step of the doubling (state 0 only sets up pointers)
 * and advances the state. The same arguments must be passed on every call.
 * NOTE(review): state is expected to be 0 on the first call - presumably
 * the caller zeros the context; confirm against the callers.
 *
 * sp_ctx  Context holding the state between calls.
 * r       Result of doubling point.
 * p       Point to double.
 * t       Temporary ordinate data.
 * returns FP_WOULDBLOCK while steps remain and MP_OKAY once the final step
 * has run.
 */
static int sp_384_proj_point_dbl_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
        const sp_point_384* p, sp_digit* t)
{
    int err = FP_WOULDBLOCK;
    sp_384_proj_point_dbl_15_ctx* ctx = (sp_384_proj_point_dbl_15_ctx*)sp_ctx->data;

    /* Compile time check: the array size is negative, and compilation fails,
     * when the state structure is not smaller than sp_ecc_ctx_t. */
    typedef char ctx_size_test[sizeof(sp_384_proj_point_dbl_15_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
    case 0:
        /* Set up only: record the temporaries and result ordinates. */
        ctx->t1 = t;
        ctx->t2 = t + 2*15;
        ctx->x = r->x;
        ctx->y = r->y;
        ctx->z = r->z;

        /* Put infinity into result. */
        if (r != p) {
            r->infinity = p->infinity;
        }
        ctx->state = 1;
        break;
    case 1:
        /* T1 = Z * Z */
        sp_384_mont_sqr_15(ctx->t1, p->z, p384_mod, p384_mp_mod);
        ctx->state = 2;
        break;
    case 2:
        /* Z = Y * Z */
        sp_384_mont_mul_15(ctx->z, p->y, p->z, p384_mod, p384_mp_mod);
        ctx->state = 3;
        break;
    case 3:
        /* Z = 2Z */
        sp_384_mont_dbl_15(ctx->z, ctx->z, p384_mod);
        ctx->state = 4;
        break;
    case 4:
        /* T2 = X - T1 */
        sp_384_mont_sub_15(ctx->t2, p->x, ctx->t1, p384_mod);
        ctx->state = 5;
        break;
    case 5:
        /* T1 = X + T1 */
        sp_384_mont_add_15(ctx->t1, p->x, ctx->t1, p384_mod);
        ctx->state = 6;
        break;
    case 6:
        /* T2 = T1 * T2 */
        sp_384_mont_mul_15(ctx->t2, ctx->t1, ctx->t2, p384_mod, p384_mp_mod);
        ctx->state = 7;
        break;
    case 7:
        /* T1 = 3T2 */
        sp_384_mont_tpl_15(ctx->t1, ctx->t2, p384_mod);
        ctx->state = 8;
        break;
    case 8:
        /* Y = 2Y */
        sp_384_mont_dbl_15(ctx->y, p->y, p384_mod);
        ctx->state = 9;
        break;
    case 9:
        /* Y = Y * Y */
        sp_384_mont_sqr_15(ctx->y, ctx->y, p384_mod, p384_mp_mod);
        ctx->state = 10;
        break;
    case 10:
        /* T2 = Y * Y */
        sp_384_mont_sqr_15(ctx->t2, ctx->y, p384_mod, p384_mp_mod);
        ctx->state = 11;
        break;
    case 11:
        /* T2 = T2/2 */
        sp_384_mont_div2_15(ctx->t2, ctx->t2, p384_mod);
        ctx->state = 12;
        break;
    case 12:
        /* Y = Y * X */
        sp_384_mont_mul_15(ctx->y, ctx->y, p->x, p384_mod, p384_mp_mod);
        ctx->state = 13;
        break;
    case 13:
        /* X = T1 * T1 */
        sp_384_mont_sqr_15(ctx->x, ctx->t1, p384_mod, p384_mp_mod);
        ctx->state = 14;
        break;
    case 14:
        /* X = X - Y */
        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->y, p384_mod);
        ctx->state = 15;
        break;
    case 15:
        /* X = X - Y */
        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->y, p384_mod);
        ctx->state = 16;
        break;
    case 16:
        /* Y = Y - X */
        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod);
        ctx->state = 17;
        break;
    case 17:
        /* Y = Y * T1 */
        sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t1, p384_mod, p384_mp_mod);
        ctx->state = 18;
        break;
    case 18:
        /* Y = Y - T2 */
        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->t2, p384_mod);
        ctx->state = 19;
        /* fall-through */
    case 19:
        /* Finished: the only path that reports MP_OKAY. */
        err = MP_OKAY;
        break;
    }

    /* Safety net: never report success before the final state. */
    if (err == MP_OKAY && ctx->state != 19) {
        err = FP_WOULDBLOCK;
    }

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */
/* Test two 15 word numbers for equality.
 * Constant time implementation: every word is always examined.
 *
 * a  First number to compare.
 * b  Second number to compare.
 * returns 1 when equal and 0 otherwise.
 */
static int sp_384_cmp_equal_15(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int k;

    /* Collect every differing bit; zero means all words matched. */
    for (k = 0; k < 15; k++) {
        diff |= a[k] ^ b[k];
    }

    return diff == 0;
}

/* Returns 1 if the number is zero.
 * Implementation is constant time: every word is always examined.
 *
 * a  Number to check.
 * returns 1 if the number is zero and 0 otherwise.
 */
static int sp_384_iszero_15(const sp_digit* a)
{
    sp_digit acc = 0;
    int k;

    /* OR all words together; any set bit makes the result non-zero. */
    for (k = 0; k < 15; k++) {
        acc |= a[k];
    }

    return acc == 0;
}


/* Add two Montgomery form projective points.
 *
 * When both inputs are finite and their cross-multiplied ordinates compare
 * equal, the doubling routine is called instead. Otherwise the general
 * addition formula is always evaluated and the output is selected with masks,
 * so that an input at infinity yields the other operand.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data. Split into six temporaries whose start offsets
 *    are 2*15 digits apart (0 to 10*15).
 */
static void sp_384_proj_point_add_15(sp_point_384* r,
        const sp_point_384* p, const sp_point_384* q, sp_digit* t)
{
    sp_digit* t6 = t;
    sp_digit* t1 = t + 2*15;
    sp_digit* t2 = t + 4*15;
    sp_digit* t3 = t + 6*15;
    sp_digit* t4 = t + 8*15;
    sp_digit* t5 = t + 10*15;

    /* U1 = X1*Z2^2 */
    sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t1, t1, p->x, p384_mod, p384_mp_mod);
    /* U2 = X2*Z1^2 */
    sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
    /* S1 = Y1*Z2^3 */
    sp_384_mont_mul_15(t3, t3, p->y, p384_mod, p384_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);

    /* Check double */
    /* Non-zero only when neither point is at infinity, U1 == U2 and S1 == S2
     * (assumes the infinity flags hold 0 or 1 - TODO confirm). */
    if ((~p->infinity) & (~q->infinity) &
            sp_384_cmp_equal_15(t2, t1) &
            sp_384_cmp_equal_15(t4, t3)) {
        sp_384_proj_point_dbl_15(r, p, t);
    }
    else {
        /* The result ordinates overlay temporaries: x is t6, y is t1 (U1) and
         * z is t2 (U2/H). The statement order below depends on this. */
        sp_digit* x = t6;
        sp_digit* y = t1;
        sp_digit* z = t2;

        /* H = U2 - U1 */
        sp_384_mont_sub_15(t2, t2, t1, p384_mod);
        /* R = S2 - S1 */
        sp_384_mont_sub_15(t4, t4, t3, p384_mod);
        /* X3 = R^2 - H^3 - 2*U1*H^2 */
        sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(y, t1, t5, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod);
        /* Z3 = H*Z1*Z2 */
        sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(z, z, q->z, p384_mod, p384_mp_mod);
        sp_384_mont_sqr_15(x, t4, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(x, x, t5, p384_mod);
        sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod);
        sp_384_mont_dbl_15(t3, y, p384_mod);
        sp_384_mont_sub_15(x, x, t3, p384_mod);
        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
        sp_384_mont_sub_15(y, y, x, p384_mod);
        sp_384_mont_mul_15(y, y, t4, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(y, y, t5, p384_mod);
        {
            int i;
            /* maskp is all ones when only q is at infinity: result is p. */
            sp_digit maskp = 0 - (q->infinity & (!p->infinity));
            /* maskq is all ones when only p is at infinity: result is q. */
            sp_digit maskq = 0 - (p->infinity & (!q->infinity));
            /* maskt is all ones when the computed sum is to be used. */
            sp_digit maskt = ~(maskp | maskq);
            /* inf is 1 only when both inputs are at infinity. */
            sp_digit inf = (sp_digit)(p->infinity & q->infinity);

            /* Branch-free selection of the output ordinates. */
            for (i = 0; i < 15; i++) {
                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
                          (x[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
                          (y[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
                          (z[i] & maskt);
            }
            /* Both at infinity: set the low bit of z and flag the result. */
            r->z[0] |= inf;
            r->infinity = (word32)inf;
        }
    }
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of the non-blocking point addition. */
typedef struct sp_384_proj_point_add_15_ctx {
    /* Step of the addition to perform on the next call; 0 is the start. */
    int state;
    /* Context for a non-blocking doubling. */
    sp_384_proj_point_dbl_15_ctx dbl_ctx;
    /* NOTE(review): ap and rp are not referenced by
     * sp_384_proj_point_add_15_nb - confirm whether they are still needed. */
    const sp_point_384* ap[2];
    sp_point_384* rp[2];
    /* Pointers to the temporaries within the caller's temporary buffer. */
    sp_digit* t1;
    sp_digit* t2;
    sp_digit* t3;
    sp_digit* t4;
    sp_digit* t5;
    sp_digit* t6;
    /* Pointers to where the resulting ordinates are computed. */
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
} sp_384_proj_point_add_15_ctx;

/* Add two Montgomery form projective points - non-blocking version.
 *
 * At most one step of the addition is performed per call, selected by the
 * state stored in the context. Call repeatedly with the same arguments until
 * the return value is no longer FP_WOULDBLOCK. State 0 is the starting state,
 * so the context must be zeroed before the first call.
 *
 * sp_ctx  Context holding the state between calls.
 * r       Result of addition.
 * p       First point to add.
 * q       Second point to add.
 * t       Temporary ordinate data.
 * returns FP_WOULDBLOCK while more calls are required and MP_OKAY once the
 *         final state (25) is entered at the start of a call.
 */
static int sp_384_proj_point_add_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
{
    int err = FP_WOULDBLOCK;
    sp_384_proj_point_add_15_ctx* ctx = (sp_384_proj_point_add_15_ctx*)sp_ctx->data;

    /* Ensure only the first point is the same as the result. */
    if (q == r) {
        const sp_point_384* a = p;
        p = q;
        q = a;
    }

    /* Compile-time check: the array size is negative, and compilation fails,
     * when this context is not smaller than the generic context. */
    typedef char ctx_size_test[sizeof(sp_384_proj_point_add_15_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    switch (ctx->state) {
    case 0: /* INIT */
        /* Split t into temporaries; x, y and z overlay t6, t1 and t2. */
        ctx->t6 = t;
        ctx->t1 = t + 2*15;
        ctx->t2 = t + 4*15;
        ctx->t3 = t + 6*15;
        ctx->t4 = t + 8*15;
        ctx->t5 = t + 10*15;
        ctx->x = ctx->t6;
        ctx->y = ctx->t1;
        ctx->z = ctx->t2;

        ctx->state = 1;
        break;
    case 1:
        /* U1 = X1*Z2^2 */
        sp_384_mont_sqr_15(ctx->t1, q->z, p384_mod, p384_mp_mod);
        ctx->state = 2;
        break;
    case 2:
        sp_384_mont_mul_15(ctx->t3, ctx->t1, q->z, p384_mod, p384_mp_mod);
        ctx->state = 3;
        break;
    case 3:
        sp_384_mont_mul_15(ctx->t1, ctx->t1, p->x, p384_mod, p384_mp_mod);
        ctx->state = 4;
        break;
    case 4:
        /* U2 = X2*Z1^2 */
        sp_384_mont_sqr_15(ctx->t2, p->z, p384_mod, p384_mp_mod);
        ctx->state = 5;
        break;
    case 5:
        sp_384_mont_mul_15(ctx->t4, ctx->t2, p->z, p384_mod, p384_mp_mod);
        ctx->state = 6;
        break;
    case 6:
        sp_384_mont_mul_15(ctx->t2, ctx->t2, q->x, p384_mod, p384_mp_mod);
        ctx->state = 7;
        break;
    case 7:
        /* S1 = Y1*Z2^3 */
        sp_384_mont_mul_15(ctx->t3, ctx->t3, p->y, p384_mod, p384_mp_mod);
        ctx->state = 8;
        break;
    case 8:
        /* S2 = Y2*Z1^3 */
        sp_384_mont_mul_15(ctx->t4, ctx->t4, q->y, p384_mod, p384_mp_mod);
        ctx->state = 9;
        break;
    case 9:
        /* Check double */
        if ((~p->infinity) & (~q->infinity) &
                sp_384_cmp_equal_15(ctx->t2, ctx->t1) &
                sp_384_cmp_equal_15(ctx->t4, ctx->t3)) {
            /* Points are equal: the blocking doubling routine is used, so the
             * whole doubling is done within this one call. The doubling
             * context is cleared but not passed to it. */
            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
            sp_384_proj_point_dbl_15(r, p, t);
            ctx->state = 25;
        }
        else {
            ctx->state = 10;
        }
        break;
    case 10:
        /* H = U2 - U1 */
        sp_384_mont_sub_15(ctx->t2, ctx->t2, ctx->t1, p384_mod);
        ctx->state = 11;
        break;
    case 11:
        /* R = S2 - S1 */
        sp_384_mont_sub_15(ctx->t4, ctx->t4, ctx->t3, p384_mod);
        ctx->state = 12;
        break;
    case 12:
        /* X3 = R^2 - H^3 - 2*U1*H^2 */
        sp_384_mont_sqr_15(ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
        ctx->state = 13;
        break;
    case 13:
        sp_384_mont_mul_15(ctx->y, ctx->t1, ctx->t5, p384_mod, p384_mp_mod);
        ctx->state = 14;
        break;
    case 14:
        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t2, p384_mod, p384_mp_mod);
        ctx->state = 15;
        break;
    case 15:
        /* Z3 = H*Z1*Z2 */
        sp_384_mont_mul_15(ctx->z, p->z, ctx->t2, p384_mod, p384_mp_mod);
        ctx->state = 16;
        break;
    case 16:
        sp_384_mont_mul_15(ctx->z, ctx->z, q->z, p384_mod, p384_mp_mod);
        ctx->state = 17;
        break;
    case 17:
        sp_384_mont_sqr_15(ctx->x, ctx->t4, p384_mod, p384_mp_mod);
        ctx->state = 18;
        break;
    case 18:
        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t5, p384_mod);
        ctx->state = 19;
        break;
    case 19:
        sp_384_mont_mul_15(ctx->t5, ctx->t5, ctx->t3, p384_mod, p384_mp_mod);
        ctx->state = 20;
        break;
    case 20:
        sp_384_mont_dbl_15(ctx->t3, ctx->y, p384_mod);
        sp_384_mont_sub_15(ctx->x, ctx->x, ctx->t3, p384_mod);
        ctx->state = 21;
        break;
    case 21:
        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->x, p384_mod);
        ctx->state = 22;
        break;
    case 22:
        sp_384_mont_mul_15(ctx->y, ctx->y, ctx->t4, p384_mod, p384_mp_mod);
        ctx->state = 23;
        break;
    case 23:
        sp_384_mont_sub_15(ctx->y, ctx->y, ctx->t5, p384_mod);
        ctx->state = 24;
        break;
    case 24:
    {
        /* Branch-free selection of the output: p when only q is at infinity,
         * q when only p is at infinity, otherwise the computed sum. */
        {
            int i;
            sp_digit maskp = 0 - (q->infinity & (!p->infinity));
            sp_digit maskq = 0 - (p->infinity & (!q->infinity));
            sp_digit maskt = ~(maskp | maskq);
            sp_digit inf = (sp_digit)(p->infinity & q->infinity);

            for (i = 0; i < 15; i++) {
                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
                          (ctx->x[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
                          (ctx->y[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
                          (ctx->z[i] & maskt);
            }
            /* Both at infinity: set the low bit of z and flag the result. */
            r->z[0] |= inf;
            r->infinity = (word32)inf;
        }
        ctx->state = 25;
        break;
    }
    case 25:
        /* Done - only this state reports success. */
        err = MP_OKAY;
        break;
    }

    if (err == MP_OKAY && ctx->state != 25) {
        err = FP_WOULDBLOCK;
    }
    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */

/* Multiply a number by Montgomery normalizer mod modulus (prime).
 *
 * The 15 digits of 26 bits are repacked into 12 words of 32 bits, combined
 * using the fixed coefficient rows given in the comments below, carry
 * propagated and repacked into 15 digits of 26 bits.
 *
 * r  The resulting Montgomery form number.
 * a  The number to convert.
 * m  The modulus (prime). Not used - the reduction is hard-coded.
 * returns MEMORY_E when memory allocation fails and MP_OKAY otherwise.
 */
static int sp_384_mod_mul_norm_15(sp_digit* r, const sp_digit* a, const sp_digit* m)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    int64_t* t = NULL;
#else
    int64_t t[2 * 12];
#endif
    int64_t* a32 = NULL;
    int64_t o;
    int err = MP_OKAY;

    (void)m;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (int64_t*)XMALLOC(sizeof(int64_t) * 2 * 12, NULL, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
#endif

    if (err == MP_OKAY) {
        /* Second half of t holds the input as 32-bit words. */
        a32 = t + 12;

        /* Repack 26-bit digits into 32-bit words. */
        a32[0] = a[0];
        a32[0] |= a[1] << 26U;
        a32[0] &= 0xffffffffL;
        a32[1] = (a[1] >> 6);
        a32[1] |= a[2] << 20U;
        a32[1] &= 0xffffffffL;
        a32[2] = (a[2] >> 12);
        a32[2] |= a[3] << 14U;
        a32[2] &= 0xffffffffL;
        a32[3] = (a[3] >> 18);
        a32[3] |= a[4] << 8U;
        a32[3] &= 0xffffffffL;
        a32[4] = (a[4] >> 24);
        a32[4] |= a[5] << 2U;
        a32[4] |= a[6] << 28U;
        a32[4] &= 0xffffffffL;
        a32[5] = (a[6] >> 4);
        a32[5] |= a[7] << 22U;
        a32[5] &= 0xffffffffL;
        a32[6] = (a[7] >> 10);
        a32[6] |= a[8] << 16U;
        a32[6] &= 0xffffffffL;
        a32[7] = (a[8] >> 16);
        a32[7] |= a[9] << 10U;
        a32[7] &= 0xffffffffL;
        a32[8] = (a[9] >> 22);
        a32[8] |= a[10] << 4U;
        a32[8] |= a[11] << 30U;
        a32[8] &= 0xffffffffL;
        a32[9] = (a[11] >> 2);
        a32[9] |= a[12] << 24U;
        a32[9] &= 0xffffffffL;
        a32[10] = (a[12] >> 8);
        a32[10] |= a[13] << 18U;
        a32[10] &= 0xffffffffL;
        a32[11] = (a[13] >> 14);
        a32[11] |= a[14] << 12U;
        a32[11] &= 0xffffffffL;

        /* Each output word is a signed combination of the input words; the
         * coefficients for a32[0]..a32[11] are listed above each line. */
        /*  1  0  0  0  0  0  0  0  1  1  0 -1 */
        t[0] = 0 + a32[0] + a32[8] + a32[9] - a32[11];
        /* -1  1  0  0  0  0  0  0 -1  0  1  1 */
        t[1] = 0 - a32[0] + a32[1] - a32[8] + a32[10] + a32[11];
        /*  0 -1  1  0  0  0  0  0  0 -1  0  1 */
        t[2] = 0 - a32[1] + a32[2] - a32[9] + a32[11];
        /*  1  0 -1  1  0  0  0  0  1  1 -1 -1 */
        t[3] = 0 + a32[0] - a32[2] + a32[3] + a32[8] + a32[9] - a32[10] - a32[11];
        /*  1  1  0 -1  1  0  0  0  1  2  1 -2 */
        t[4] = 0 + a32[0] + a32[1] - a32[3] + a32[4] + a32[8] + 2 * a32[9] + a32[10] -  2 * a32[11];
        /*  0  1  1  0 -1  1  0  0  0  1  2  1 */
        t[5] = 0 + a32[1] + a32[2] - a32[4] + a32[5] + a32[9] + 2 * a32[10] + a32[11];
        /*  0  0  1  1  0 -1  1  0  0  0  1  2 */
        t[6] = 0 + a32[2] + a32[3] - a32[5] + a32[6] + a32[10] + 2 * a32[11];
        /*  0  0  0  1  1  0 -1  1  0  0  0  1 */
        t[7] = 0 + a32[3] + a32[4] - a32[6] + a32[7] + a32[11];
        /*  0  0  0  0  1  1  0 -1  1  0  0  0 */
        t[8] = 0 + a32[4] + a32[5] - a32[7] + a32[8];
        /*  0  0  0  0  0  1  1  0 -1  1  0  0 */
        t[9] = 0 + a32[5] + a32[6] - a32[8] + a32[9];
        /*  0  0  0  0  0  0  1  1  0 -1  1  0 */
        t[10] = 0 + a32[6] + a32[7] - a32[9] + a32[10];
        /*  0  0  0  0  0  0  0  1  1  0 -1  1 */
        t[11] = 0 + a32[7] + a32[8] - a32[10] + a32[11];

        /* Propagate (signed) carries so each word is within 32 bits. */
        t[1] += t[0] >> 32; t[0] &= 0xffffffff;
        t[2] += t[1] >> 32; t[1] &= 0xffffffff;
        t[3] += t[2] >> 32; t[2] &= 0xffffffff;
        t[4] += t[3] >> 32; t[3] &= 0xffffffff;
        t[5] += t[4] >> 32; t[4] &= 0xffffffff;
        t[6] += t[5] >> 32; t[5] &= 0xffffffff;
        t[7] += t[6] >> 32; t[6] &= 0xffffffff;
        t[8] += t[7] >> 32; t[7] &= 0xffffffff;
        t[9] += t[8] >> 32; t[8] &= 0xffffffff;
        t[10] += t[9] >> 32; t[9] &= 0xffffffff;
        t[11] += t[10] >> 32; t[10] &= 0xffffffff;
        /* Overflow out of the top word is folded back into words 0, 1, 3 and
         * 4 (presumably 2^384 = 2^128 + 2^96 - 2^32 + 1 mod the P-384 prime -
         * TODO confirm against p384_mod). */
        o     = t[11] >> 32; t[11] &= 0xffffffff;
        t[0] += o;
        t[1] -= o;
        t[3] += o;
        t[4] += o;
        /* Propagate the carries introduced by the fold. */
        t[1] += t[0] >> 32; t[0] &= 0xffffffff;
        t[2] += t[1] >> 32; t[1] &= 0xffffffff;
        t[3] += t[2] >> 32; t[2] &= 0xffffffff;
        t[4] += t[3] >> 32; t[3] &= 0xffffffff;
        t[5] += t[4] >> 32; t[4] &= 0xffffffff;
        t[6] += t[5] >> 32; t[5] &= 0xffffffff;
        t[7] += t[6] >> 32; t[6] &= 0xffffffff;
        t[8] += t[7] >> 32; t[7] &= 0xffffffff;
        t[9] += t[8] >> 32; t[8] &= 0xffffffff;
        t[10] += t[9] >> 32; t[9] &= 0xffffffff;
        t[11] += t[10] >> 32; t[10] &= 0xffffffff;

        /* Repack 32-bit words into 26-bit digits. */
        r[0] = (sp_digit)(t[0]) & 0x3ffffffL;
        r[1] = (sp_digit)(t[0] >> 26U);
        r[1] |= (sp_digit)(t[1] << 6U);
        r[1] &= 0x3ffffffL;
        r[2] = (sp_digit)(t[1] >> 20U);
        r[2] |= (sp_digit)(t[2] << 12U);
        r[2] &= 0x3ffffffL;
        r[3] = (sp_digit)(t[2] >> 14U);
        r[3] |= (sp_digit)(t[3] << 18U);
        r[3] &= 0x3ffffffL;
        r[4] = (sp_digit)(t[3] >> 8U);
        r[4] |= (sp_digit)(t[4] << 24U);
        r[4] &= 0x3ffffffL;
        r[5] = (sp_digit)(t[4] >> 2U) & 0x3ffffffL;
        r[6] = (sp_digit)(t[4] >> 28U);
        r[6] |= (sp_digit)(t[5] << 4U);
        r[6] &= 0x3ffffffL;
        r[7] = (sp_digit)(t[5] >> 22U);
        r[7] |= (sp_digit)(t[6] << 10U);
        r[7] &= 0x3ffffffL;
        r[8] = (sp_digit)(t[6] >> 16U);
        r[8] |= (sp_digit)(t[7] << 16U);
        r[8] &= 0x3ffffffL;
        r[9] = (sp_digit)(t[7] >> 10U);
        r[9] |= (sp_digit)(t[8] << 22U);
        r[9] &= 0x3ffffffL;
        r[10] = (sp_digit)(t[8] >> 4U) & 0x3ffffffL;
        r[11] = (sp_digit)(t[8] >> 30U);
        r[11] |= (sp_digit)(t[9] << 2U);
        r[11] &= 0x3ffffffL;
        r[12] = (sp_digit)(t[9] >> 24U);
        r[12] |= (sp_digit)(t[10] << 8U);
        r[12] &= 0x3ffffffL;
        r[13] = (sp_digit)(t[10] >> 18U);
        r[13] |= (sp_digit)(t[11] << 14U);
        r[13] &= 0x3ffffffL;
        /* Top digit is not masked - it takes all remaining bits of t[11]. */
        r[14] = (sp_digit)(t[11] >> 12U);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
        XFREE(t, NULL, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef WOLFSSL_SP_SMALL
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Small implementation using add and double that is cache attack resistant but
 * allocates memory rather than use large stacks.
 * 384 adds and doubles.
 *
 * For every scalar bit, from most to least significant, one point addition
 * and one point doubling are performed regardless of the bit's value. The bit
 * only selects which of the two working points receives each result.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_384_ecc_mulmod_15(sp_point_384* r, const sp_point_384* g,
        const sp_digit* k, int map, int ct, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* t = NULL;
    sp_digit* tmp = NULL;
#else
    sp_point_384 t[3];
    sp_digit tmp[2 * 15 * 6];
#endif
    sp_digit n;
    int i;
    int c;
    int y;
    int err = MP_OKAY;

    /* Implementation is constant time. */
    (void)ct;
    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 3, heap,
                                     DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 6, heap,
                                 DYNAMIC_TYPE_ECC);
        if (tmp == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        XMEMSET(t, 0, sizeof(sp_point_384) * 3);

        /* t[0] = {0, 0, 1} * norm */
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_384_mod_mul_norm_15(t[1].x, g->x, p384_mod);
    }
    if (err == MP_OKAY)
        err = sp_384_mod_mul_norm_15(t[1].y, g->y, p384_mod);
    if (err == MP_OKAY)
        err = sp_384_mod_mul_norm_15(t[1].z, g->z, p384_mod);

    if (err == MP_OKAY) {
        /* Start at the top digit, which holds 20 bits (14 * 26 + 20 = 384).
         * Shift it up so that the next bit to use is always bit 25. */
        i = 14;
        c = 20;
        n = k[i--] << (26 - c);
        for (; ; c--) {
            if (c == 0) {
                /* Current digit used up - stop after the lowest digit. */
                if (i == -1)
                    break;

                n = k[i--];
                c = 26;
            }

            /* Take the next scalar bit from the top of the digit. */
            y = (n >> 25) & 1;
            n <<= 1;

            /* t[y^1] = t[0] + t[1] */
            sp_384_proj_point_add_15(&t[y^1], &t[0], &t[1], tmp);

            /* t[y] = 2 * t[y], done through t[2]. The address of t[y] is
             * formed by masking rather than indexing (presumably addr_mask[0]
             * is zero and addr_mask[1] is all ones - TODO confirm). */
            XMEMCPY(&t[2], (void*)(((size_t)&t[0] & addr_mask[y^1]) +
                                   ((size_t)&t[1] & addr_mask[y])),
                    sizeof(sp_point_384));
            sp_384_proj_point_dbl_15(&t[2], &t[2], tmp);
            XMEMCPY((void*)(((size_t)&t[0] & addr_mask[y^1]) +
                            ((size_t)&t[1] & addr_mask[y])), &t[2],
                    sizeof(sp_point_384));
        }

        if (map != 0) {
            sp_384_map_15(r, &t[0], tmp);
        }
        else {
            XMEMCPY(r, &t[0], sizeof(sp_point_384));
        }
    }

    /* Working data is derived from the scalar - zeroize before release. */
#ifdef WOLFSSL_SP_SMALL_STACK
    if (tmp != NULL)
#endif
    {
        ForceZero(tmp, sizeof(sp_digit) * 2 * 15 * 6);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
    #endif
    }
#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
#endif
    {
        ForceZero(t, sizeof(sp_point_384) * 3);
    #ifdef WOLFSSL_SP_SMALL_STACK
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    #endif
    }

    return err;
}

#ifdef WOLFSSL_SP_NONBLOCK
/* State kept between calls of the non-blocking scalar multiplication. */
typedef struct sp_384_ecc_mulmod_15_ctx {
    /* Step to perform on the next call; 0 is the start. */
    int state;
    /* Context of the point operation in progress. Addition and doubling are
     * never in progress at the same time so they share storage. */
    union {
        sp_384_proj_point_dbl_15_ctx dbl_ctx;
        sp_384_proj_point_add_15_ctx add_ctx;
    };
    /* t[0] and t[1] are the working points; t[2] is used for the doubling. */
    sp_point_384 t[3];
    /* Temporary ordinate data for the point operations. */
    sp_digit tmp[2 * 15 * 6];
    /* Current scalar digit, shifted so the next bit is at bit 25. */
    sp_digit n;
    /* Index of the next scalar digit to load. */
    int i;
    /* Number of bits remaining in n. */
    int c;
    /* Scalar bit being processed. */
    int y;
} sp_384_ecc_mulmod_15_ctx;

/* Multiply the point by the scalar and return the result - non-blocking
 * version.
 * If map is true then convert result to affine coordinates.
 *
 * At most one step is performed per call, selected by the state stored in the
 * context. Call repeatedly with the same arguments until the return value is
 * no longer FP_WOULDBLOCK. State 0 is the starting state, so the context must
 * be zeroed before the first call.
 *
 * sp_ctx  Context holding the state, points and temporaries between calls.
 * r       Resulting point.
 * g       Point to multiply.
 * k       Scalar to multiply by.
 * map     Indicates whether to convert result to affine.
 * ct      Constant time required. Not used.
 * heap    Heap to use for allocation. Not used.
 * returns FP_WOULDBLOCK while more calls are required, MP_OKAY when the
 *         result has been written to r, and otherwise the error returned by
 *         sp_384_mod_mul_norm_15.
 */
static int sp_384_ecc_mulmod_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
    const sp_point_384* g, const sp_digit* k, int map, int ct, void* heap)
{
    int err = FP_WOULDBLOCK;
    sp_384_ecc_mulmod_15_ctx* ctx = (sp_384_ecc_mulmod_15_ctx*)sp_ctx->data;

    /* Compile-time check: the array size is negative, and compilation fails,
     * when this context is not smaller than the generic context. */
    typedef char ctx_size_test[sizeof(sp_384_ecc_mulmod_15_ctx) >= sizeof(*sp_ctx) ? -1 : 1];
    (void)sizeof(ctx_size_test);

    /* Implementation is constant time. */
    (void)ct;

    switch (ctx->state) {
    case 0: /* INIT */
        XMEMSET(ctx->t, 0, sizeof(sp_point_384) * 3);
        /* Start at the top digit, which holds 20 bits, shifted up so that the
         * next bit to use is always bit 25. */
        ctx->i = 14;
        ctx->c = 20;
        ctx->n = k[ctx->i--] << (26 - ctx->c);

        /* t[0] = {0, 0, 1} * norm */
        ctx->t[0].infinity = 1;
        ctx->state = 1;
        break;
    case 1: /* T1X */
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_384_mod_mul_norm_15(ctx->t[1].x, g->x, p384_mod);
        ctx->state = 2;
        break;
    case 2: /* T1Y */
        err = sp_384_mod_mul_norm_15(ctx->t[1].y, g->y, p384_mod);
        ctx->state = 3;
        break;
    case 3: /* T1Z */
        err = sp_384_mod_mul_norm_15(ctx->t[1].z, g->z, p384_mod);
        ctx->state = 4;
        break;
    case 4: /* ADDPREP */
        if (ctx->c == 0) {
            /* Current digit used up - move to MAP after the lowest digit. */
            if (ctx->i == -1) {
                ctx->state = 7;
                break;
            }

            ctx->n = k[ctx->i--];
            ctx->c = 26;
        }
        /* Take the next scalar bit from the top of the digit. */
        ctx->y = (ctx->n >> 25) & 1;
        ctx->n <<= 1;
        /* Reset the addition context so the addition starts at its INIT. */
        XMEMSET(&ctx->add_ctx, 0, sizeof(ctx->add_ctx));
        ctx->state = 5;
        break;
    case 5: /* ADD */
        /* t[y^1] = t[0] + t[1] - stays in this state until complete. */
        err = sp_384_proj_point_add_15_nb((sp_ecc_ctx_t*)&ctx->add_ctx,
            &ctx->t[ctx->y^1], &ctx->t[0], &ctx->t[1], ctx->tmp);
        if (err == MP_OKAY) {
            /* Copy t[y] into t[2] for doubling; the address is formed by
             * masking rather than indexing. */
            XMEMCPY(&ctx->t[2], (void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) +
                                        ((size_t)&ctx->t[1] & addr_mask[ctx->y])),
                    sizeof(sp_point_384));
            XMEMSET(&ctx->dbl_ctx, 0, sizeof(ctx->dbl_ctx));
            ctx->state = 6;
        }
        break;
    case 6: /* DBL */
        /* t[2] = 2 * t[2] - stays in this state until complete. */
        err = sp_384_proj_point_dbl_15_nb((sp_ecc_ctx_t*)&ctx->dbl_ctx, &ctx->t[2],
            &ctx->t[2], ctx->tmp);
        if (err == MP_OKAY) {
            /* Store the doubled point back into t[y] and move to next bit. */
            XMEMCPY((void*)(((size_t)&ctx->t[0] & addr_mask[ctx->y^1]) +
                            ((size_t)&ctx->t[1] & addr_mask[ctx->y])), &ctx->t[2],
                    sizeof(sp_point_384));
            ctx->state = 4;
            ctx->c--;
        }
        break;
    case 7: /* MAP */
        if (map != 0) {
            sp_384_map_15(r, &ctx->t[0], ctx->tmp);
        }
        else {
            XMEMCPY(r, &ctx->t[0], sizeof(sp_point_384));
        }
        err = MP_OKAY;
        break;
    }

    /* A sub-step completing is not completion of the multiplication. */
    if (err == MP_OKAY && ctx->state != 7) {
        err = FP_WOULDBLOCK;
    }
    /* On completion or error, zeroize data derived from the scalar. */
    if (err != FP_WOULDBLOCK) {
        ForceZero(ctx->tmp, sizeof(ctx->tmp));
        ForceZero(ctx->t, sizeof(ctx->t));
    }

    (void)heap;

    return err;
}

#endif /* WOLFSSL_SP_NONBLOCK */

#else
/* A table entry for pre-computed points.
 * Holds only the x and y ordinates, 15 digits each.
 */
typedef struct sp_table_entry_384 {
    /* X ordinate. */
    sp_digit x[15];
    /* Y ordinate. */
    sp_digit y[15];
} sp_table_entry_384;

/* Copy a into r when the mask is set; leave r unchanged when it is clear.
 * m is -1 to copy and 0 when not.
 * The same operations are performed for either mask value.
 *
 * r  A single precision number to copy over.
 * a  A single precision number to copy.
 * m  Mask value to apply.
 */
static void sp_384_cond_copy_15(sp_digit* r, const sp_digit* a, const sp_digit m)
{
    sp_digit delta[15];
    int idx;

    /* First pass: record the bits in which r and a differ. */
    for (idx = 0; idx < 15; idx++) {
        delta[idx] = r[idx] ^ a[idx];
    }
    /* Second pass: flip those bits in r only where the mask allows. */
    for (idx = 0; idx < 15; idx++) {
        r[idx] ^= delta[idx] & m;
    }
}

/* Double the Montgomery form projective point p a number of times.
 *
 * The point is doubled in place. Y is held as 2*Y and W (Z^4) is carried
 * between iterations; Y is halved once at the end.
 *
 * p  Point to double. Holds the result on return.
 * i  Number of times to double.
 * t  Temporary ordinate data. Split into five temporaries whose start offsets
 *    are 2*15 digits apart (0 to 8*15).
 */
static void sp_384_proj_point_dbl_n_15(sp_point_384* p, int i,
    sp_digit* t)
{
    sp_digit* w = t;
    sp_digit* a = t + 2*15;
    sp_digit* b = t + 4*15;
    sp_digit* t1 = t + 6*15;
    sp_digit* t2 = t + 8*15;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    volatile int n = i;

    x = p->x;
    y = p->y;
    z = p->z;

    /* Y = 2*Y */
    sp_384_mont_dbl_15(y, y, p384_mod);
    /* W = Z^4 */
    sp_384_mont_sqr_15(w, z, p384_mod, p384_mp_mod);
    sp_384_mont_sqr_15(w, w, p384_mod, p384_mp_mod);
    /* Without WOLFSSL_SP_SMALL the loop performs all but the last doubling
     * and the last one is written out after the loop without the update of W.
     * With WOLFSSL_SP_SMALL the loop performs every doubling and skips the
     * update of W on the last pass. */
#ifndef WOLFSSL_SP_SMALL
    while (--n > 0)
#else
    while (--n >= 0)
#endif
    {
        /* A = 3*(X^2 - W) */
        sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(t1, t1, w, p384_mod);
        sp_384_mont_tpl_15(a, t1, p384_mod);
        /* B = X*Y^2 */
        sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod);
        /* X = A^2 - 2B */
        sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
        sp_384_mont_dbl_15(t2, b, p384_mod);
        sp_384_mont_sub_15(x, x, t2, p384_mod);
        /* B = 2.(B - X) */
        sp_384_mont_sub_15(t2, b, x, p384_mod);
        sp_384_mont_dbl_15(b, t2, p384_mod);
        /* Z = Z*Y */
        sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod);
        /* t1 = Y^4 */
        sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod);
#ifdef WOLFSSL_SP_SMALL
        if (n != 0)
#endif
        {
            /* W = W*Y^4 */
            sp_384_mont_mul_15(w, w, t1, p384_mod, p384_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(y, y, t1, p384_mod);
    }
#ifndef WOLFSSL_SP_SMALL
    /* Last doubling - W is not needed again so it is not updated. */
    /* A = 3*(X^2 - W) */
    sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(t1, t1, w, p384_mod);
    sp_384_mont_tpl_15(a, t1, p384_mod);
    /* B = X*Y^2 */
    sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod);
    /* X = A^2 - 2B */
    sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
    sp_384_mont_dbl_15(t2, b, p384_mod);
    sp_384_mont_sub_15(x, x, t2, p384_mod);
    /* B = 2.(B - X) */
    sp_384_mont_sub_15(t2, b, x, p384_mod);
    sp_384_mont_dbl_15(b, t2, p384_mod);
    /* Z = Z*Y */
    sp_384_mont_mul_15(z, z, y, p384_mod, p384_mp_mod);
    /* t1 = Y^4 */
    sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod);
    /* y = 2*A*(B - X) - Y^4 */
    sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(y, y, t1, p384_mod);
#endif /* WOLFSSL_SP_SMALL */
    /* Y = Y/2 */
    sp_384_mont_div2_15(y, y, p384_mod);
}

/* Double the Montgomery form projective point p a number of times, storing
 * the result of each doubling.
 *
 * The i-th doubling (i = 1..n) is written to r[m * 2^i]. The x and z of
 * r[2*m] and the y of r[m * 2^n] are also used as working storage, the
 * latter holding the doubled Y value carried between iterations.
 *
 * r  Table of points to store the results in.
 * p  Point to double.
 * n  Number of times to double
 * m  Multiplier for the index of each stored point.
 * t  Temporary ordinate data. Split into five temporaries whose start offsets
 *    are 2*15 digits apart (0 to 8*15).
 */
static void sp_384_proj_point_dbl_n_store_15(sp_point_384* r,
        const sp_point_384* p, int n, int m, sp_digit* t)
{
    sp_digit* w = t;
    sp_digit* a = t + 2*15;
    sp_digit* b = t + 4*15;
    sp_digit* t1 = t + 6*15;
    sp_digit* t2 = t + 8*15;
    sp_digit* x = r[2*m].x;
    sp_digit* y = r[(1<<n)*m].y;
    sp_digit* z = r[2*m].z;
    int i;
    int j;

    /* Load the ordinates of p into the working locations. */
    for (i=0; i<15; i++) {
        x[i] = p->x[i];
    }
    for (i=0; i<15; i++) {
        y[i] = p->y[i];
    }
    for (i=0; i<15; i++) {
        z[i] = p->z[i];
    }

    /* Y = 2*Y */
    sp_384_mont_dbl_15(y, y, p384_mod);
    /* W = Z^4 */
    sp_384_mont_sqr_15(w, z, p384_mod, p384_mp_mod);
    sp_384_mont_sqr_15(w, w, p384_mod, p384_mp_mod);
    j = m;
    for (i=1; i<=n; i++) {
        /* Index of the table entry that receives this doubling. */
        j *= 2;

        /* A = 3*(X^2 - W) */
        sp_384_mont_sqr_15(t1, x, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(t1, t1, w, p384_mod);
        sp_384_mont_tpl_15(a, t1, p384_mod);
        /* B = X*Y^2 */
        sp_384_mont_sqr_15(t1, y, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(b, t1, x, p384_mod, p384_mp_mod);
        /* The previous X has been consumed - write the new X to entry j. */
        x = r[j].x;
        /* X = A^2 - 2B */
        sp_384_mont_sqr_15(x, a, p384_mod, p384_mp_mod);
        sp_384_mont_dbl_15(t2, b, p384_mod);
        sp_384_mont_sub_15(x, x, t2, p384_mod);
        /* B = 2.(B - X) */
        sp_384_mont_sub_15(t2, b, x, p384_mod);
        sp_384_mont_dbl_15(b, t2, p384_mod);
        /* Z = Z*Y */
        sp_384_mont_mul_15(r[j].z, z, y, p384_mod, p384_mp_mod);
        z = r[j].z;
        /* t1 = Y^4 */
        sp_384_mont_sqr_15(t1, t1, p384_mod, p384_mp_mod);
        if (i != n) {
            /* W = W*Y^4 */
            sp_384_mont_mul_15(w, w, t1, p384_mod, p384_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_384_mont_mul_15(y, b, a, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(y, y, t1, p384_mod);
        /* Y = Y/2 */
        /* Halved value goes to entry j; y itself keeps the doubled value. */
        sp_384_mont_div2_15(r[j].y, y, p384_mod);
        r[j].infinity = 0;
    }
}

/* Add and subtract two Montgomery form projective points.
 *
 * Computes both p + q and p - q, sharing the common intermediate values.
 * There is no handling of points at infinity or of equal points: both
 * results are always marked as not being at infinity.
 *
 * ra  Result of addition.
 * rs  Result of subtraction.
 * p   First point to add.
 * q   Second point to add.
 * t   Temporary ordinate data. Split into six temporaries whose start offsets
 *     are 2*15 digits apart (0 to 10*15).
 */
static void sp_384_proj_point_add_sub_15(sp_point_384* ra,
        sp_point_384* rs, const sp_point_384* p, const sp_point_384* q,
        sp_digit* t)
{
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*15;
    sp_digit* t3 = t + 4*15;
    sp_digit* t4 = t + 6*15;
    sp_digit* t5 = t + 8*15;
    sp_digit* t6 = t + 10*15;
    sp_digit* xa = ra->x;
    sp_digit* ya = ra->y;
    sp_digit* za = ra->z;
    sp_digit* xs = rs->x;
    sp_digit* ys = rs->y;
    sp_digit* zs = rs->z;


    /* Work on a copy of p held in ra. Half of each ordinate array is copied
     * (presumably the arrays hold 2*15 digits - TODO confirm). */
    XMEMCPY(xa, p->x, sizeof(p->x) / 2);
    XMEMCPY(ya, p->y, sizeof(p->y) / 2);
    XMEMCPY(za, p->z, sizeof(p->z) / 2);
    ra->infinity = 0;
    rs->infinity = 0;

    /* U1 = X1*Z2^2 */
    sp_384_mont_sqr_15(t1, q->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t3, t1, q->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t1, t1, xa, p384_mod, p384_mp_mod);
    /* U2 = X2*Z1^2 */
    sp_384_mont_sqr_15(t2, za, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t4, t2, za, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
    /* S1 = Y1*Z2^3 */
    sp_384_mont_mul_15(t3, t3, ya, p384_mod, p384_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);
    /* H = U2 - U1 */
    sp_384_mont_sub_15(t2, t2, t1, p384_mod);
    /* RS = S2 + S1 */
    sp_384_mont_add_15(t6, t4, t3, p384_mod);
    /* R = S2 - S1 */
    sp_384_mont_sub_15(t4, t4, t3, p384_mod);
    /* Z3 = H*Z1*Z2 */
    /* ZS = H*Z1*Z2 */
    sp_384_mont_mul_15(za, za, q->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(za, za, t2, p384_mod, p384_mp_mod);
    XMEMCPY(zs, za, sizeof(p->z)/2);
    /* X3 = R^2 - H^3 - 2*U1*H^2 */
    /* XS = RS^2 - H^3 - 2*U1*H^2 */
    sp_384_mont_sqr_15(xa, t4, p384_mod, p384_mp_mod);
    sp_384_mont_sqr_15(xs, t6, p384_mod, p384_mp_mod);
    sp_384_mont_sqr_15(t5, t2, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(ya, t1, t5, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t5, t5, t2, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(xa, xa, t5, p384_mod);
    sp_384_mont_sub_15(xs, xs, t5, p384_mod);
    sp_384_mont_dbl_15(t1, ya, p384_mod);
    sp_384_mont_sub_15(xa, xa, t1, p384_mod);
    sp_384_mont_sub_15(xs, xs, t1, p384_mod);
    /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
    /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */
    /* ys must be calculated before ya is overwritten: ya holds U1*H^2. */
    sp_384_mont_sub_15(ys, ya, xs, p384_mod);
    sp_384_mont_sub_15(ya, ya, xa, p384_mod);
    sp_384_mont_mul_15(ya, ya, t4, p384_mod, p384_mp_mod);
    /* t6 = modulus - RS, that is -RS */
    sp_384_sub_15(t6, p384_mod, t6);
    sp_384_mont_mul_15(ys, ys, t6, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t5, t5, t3, p384_mod, p384_mp_mod);
    sp_384_mont_sub_15(ya, ya, t5, p384_mod);
    sp_384_mont_sub_15(ys, ys, t5, p384_mod);
}

/* Structure used to describe recoding of scalar multiplication.
 * One entry is produced for each 6-bit window of the scalar.
 */
typedef struct ecc_recode_384 {
    /* Index into pre-computation table. */
    uint8_t i;
    /* Use the negative of the point. */
    uint8_t neg;
} ecc_recode_384;

/* The index into pre-computation table to use.
 * Indexed by the 6-bit window value plus carry: values 0..32 map to
 * themselves, values 33..63 map to 64 minus the value, and values 64 and 65
 * map to 0 and 1.
 */
static const uint8_t recode_index_15_6[66] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
    16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
     0,  1,
};

/* Whether to negate y-ordinate.
 * Indexed by the 6-bit window value plus carry: 1 for values 32..63 and 0
 * for all other values.
 */
static const uint8_t recode_neg_15_6[66] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     0,  0,
};

/* Recode the scalar for multiplication using pre-computed values and
 * subtraction.
 *
 * The scalar is read from the least significant digit upwards in 65 windows
 * of 6 bits. Each window value, plus the carry from the previous window, is
 * looked up in recode_index_15_6 and recode_neg_15_6 to get the table index
 * and whether to negate.
 *
 * k  Scalar to multiply by.
 * v  Vector of operations to perform. 65 entries are written.
 */
static void sp_384_ecc_recode_6_15(const sp_digit* k, ecc_recode_384* v)
{
    int i;
    int j;
    uint8_t y;
    int carry = 0;
    int o;
    sp_digit n;

    /* j is the current digit, n its unused bits, o the number of bits of the
     * digit already used. */
    j = 0;
    n = k[j];
    o = 0;
    for (i=0; i<65; i++) {
        y = (int8_t)n;
        if (o + 6 < 26) {
            /* Window is entirely within the current digit. */
            y &= 0x3f;
            n >>= 6;
            o += 6;
        }
        else if (o + 6 == 26) {
            /* Window ends at the top of the current digit. */
            n >>= 6;
            if (++j < 15)
                n = k[j];
            o = 0;
        }
        else if (++j < 15) {
            /* Window crosses into the next digit - take the high bits of the
             * window from the bottom of the next digit. */
            n = k[j];
            y |= (uint8_t)((n << (26 - o)) & 0x3f);
            o -= 20;
            n >>= o;
        }

        y += (uint8_t)carry;
        v[i].i = recode_index_15_6[y];
        v[i].neg = recode_neg_15_6[y];
        /* Carry into the next window when this one was negated or when the
         * value reached 64. */
        carry = (y >> 6) + v[i].neg;
    }
}

#ifndef WC_NO_CACHE_RESISTANT
/* Fetch a point from the table while reading every candidate entry, so that
 * the entries accessed do not depend on the index.
 *
 * Entries 1..32 are read; an index outside that range (including 0) leaves
 * the ordinates of r all zero. The infinity field of r is not modified.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of entry to retrieve.
 */
static void sp_384_get_point_33_15(sp_point_384* r, const sp_point_384* table,
    int idx)
{
    int entry;
    int w;
    sp_digit sel;

    /* Clear the result so that the selected entry can be OR-ed in. */
    for (w = 0; w < 15; w++) {
        r->x[w] = 0;
    }
    for (w = 0; w < 15; w++) {
        r->y[w] = 0;
    }
    for (w = 0; w < 15; w++) {
        r->z[w] = 0;
    }

    for (entry = 1; entry < 33; entry++) {
        /* All ones for the wanted entry, zero for every other entry. */
        sel = 0 - (entry == idx);
        for (w = 0; w < 15; w++) {
            r->x[w] |= sel & table[entry].x[w];
        }
        for (w = 0; w < 15; w++) {
            r->y[w] |= sel & table[entry].y[w];
        }
        for (w = 0; w < 15; w++) {
            r->z[w] |= sel & table[entry].z[w];
        }
    }
}
#endif /* !WC_NO_CACHE_RESISTANT */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Window technique of 6 bits. (Add-Sub variation.)
 * Calculate 0..32 times the point. Use function that adds and
 * subtracts the same two points.
 * Recode to add or subtract one of the computed points.
 * Double to push up.
 * NOT a sliding window.
 *
 * The scalar is recoded into 65 windows by sp_384_ecc_recode_6_15. The
 * windows are processed from the last (index 64) down to the first: each
 * step calls sp_384_proj_point_dbl_n_15 with a count of 6 and then adds the
 * selected table entry, with its y-ordinate negated when the window is
 * flagged as negative.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required. When non-zero, and WC_NO_CACHE_RESISTANT is
 *       not defined, table entries are fetched with sp_384_get_point_33_15
 *       rather than by direct indexing.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_384_ecc_mulmod_win_add_sub_15(sp_point_384* r, const sp_point_384* g,
        const sp_digit* k, int map, int ct, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* t = NULL;
    sp_digit* tmp = NULL;
#else
    /* 33 table entries followed by two working points (rt and p). */
    sp_point_384 t[33+2];
    sp_digit tmp[2 * 15 * 6];
#endif
    sp_point_384* rt = NULL;
    sp_point_384* p = NULL;
    sp_digit* negy;
    int i;
    ecc_recode_384 v[65];
    int err = MP_OKAY;

    /* Constant time used for cache attack resistance implementation. */
    (void)ct;
    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) *
        (33+2), heap, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    if (err == MP_OKAY) {
        tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 6,
                                 heap, DYNAMIC_TYPE_ECC);
        if (tmp == NULL)
            err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        /* Running result and the point to add live after the table. */
        rt = t + 33;
        p  = t + 33+1;

        /* t[0] = {0, 0, 1} * norm */
        XMEMSET(&t[0], 0, sizeof(t[0]));
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_384_mod_mul_norm_15(t[1].x, g->x, p384_mod);
    }
    if (err == MP_OKAY) {
        err = sp_384_mod_mul_norm_15(t[1].y, g->y, p384_mod);
    }
    if (err == MP_OKAY) {
        err = sp_384_mod_mul_norm_15(t[1].z, g->z, p384_mod);
    }

    if (err == MP_OKAY) {
        t[1].infinity = 0;
        /* t[2] ... t[32]  */
        /* NOTE(review): presumably the dbl_n_store call fills the power of
         * two entries (t[2], t[4], ..., t[32]) and each add_sub call writes
         * both the sum (first argument) and the difference (second argument)
         * of its last two point arguments - confirm against those helpers. */
        sp_384_proj_point_dbl_n_store_15(t, &t[ 1], 5, 1, tmp);
        sp_384_proj_point_add_15(&t[ 3], &t[ 2], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[ 6], &t[ 3], tmp);
        sp_384_proj_point_add_sub_15(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[10], &t[ 5], tmp);
        sp_384_proj_point_add_sub_15(&t[11], &t[ 9], &t[10], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[12], &t[ 6], tmp);
        sp_384_proj_point_dbl_15(&t[14], &t[ 7], tmp);
        sp_384_proj_point_add_sub_15(&t[15], &t[13], &t[14], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[18], &t[ 9], tmp);
        sp_384_proj_point_add_sub_15(&t[19], &t[17], &t[18], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[20], &t[10], tmp);
        sp_384_proj_point_dbl_15(&t[22], &t[11], tmp);
        sp_384_proj_point_add_sub_15(&t[23], &t[21], &t[22], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[24], &t[12], tmp);
        sp_384_proj_point_dbl_15(&t[26], &t[13], tmp);
        sp_384_proj_point_add_sub_15(&t[27], &t[25], &t[26], &t[ 1], tmp);
        sp_384_proj_point_dbl_15(&t[28], &t[14], tmp);
        sp_384_proj_point_dbl_15(&t[30], &t[15], tmp);
        sp_384_proj_point_add_sub_15(&t[31], &t[29], &t[30], &t[ 1], tmp);

        /* The y-ordinate of t[0] is reused as scratch for the negated y. */
        negy = t[0].y;

        sp_384_ecc_recode_6_15(k, v);

        /* Start the result from the last (most significant) window. */
        i = 64;
    #ifndef WC_NO_CACHE_RESISTANT
        if (ct) {
            sp_384_get_point_33_15(rt, t, v[i].i);
            /* get_point does not set infinity: index 0 means infinity. */
            rt->infinity = !v[i].i;
        }
        else
    #endif
        {
            XMEMCPY(rt, &t[v[i].i], sizeof(sp_point_384));
        }
        for (--i; i>=0; i--) {
            /* Make room for the next window (count of 6 = window width). */
            sp_384_proj_point_dbl_n_15(rt, 6, tmp);

        #ifndef WC_NO_CACHE_RESISTANT
            if (ct) {
                sp_384_get_point_33_15(p, t, v[i].i);
                p->infinity = !v[i].i;
            }
            else
        #endif
            {
                XMEMCPY(p, &t[v[i].i], sizeof(sp_point_384));
            }
            /* Always compute -y; it replaces p->y only when the window is
             * flagged negative (mask of all ones when v[i].neg is 1). */
            sp_384_sub_15(negy, p384_mod, p->y);
            sp_384_norm_15(negy);
            sp_384_cond_copy_15(p->y, negy, (sp_digit)0 - v[i].neg);
            sp_384_proj_point_add_15(rt, rt, p, tmp);
        }

        if (map != 0) {
            sp_384_map_15(r, rt, tmp);
        }
        else {
            XMEMCPY(r, rt, sizeof(sp_point_384));
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL)
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    if (tmp != NULL)
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif

    return err;
}

#ifdef FP_ECC
#endif /* FP_ECC */
/* Add two Montgomery form projective points. The second point has a z value of
 * one.
 * Only the first point can be the same pointer as the result point.
 *
 * The z-ordinate of q is not used in the calculation. When p and q are the
 * same point (neither at infinity) the doubling function is called instead.
 * When exactly one of the points is at infinity the other point is the
 * result; when both are, the result is marked as infinity.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data. Offsets up to 10*15 digits are used as the
 *    start of temporaries.
 */
static void sp_384_proj_point_add_qz1_15(sp_point_384* r,
    const sp_point_384* p, const sp_point_384* q, sp_digit* t)
{
    sp_digit* t2 = t;
    sp_digit* t3 = t + 2*15;
    sp_digit* t6 = t + 4*15;
    sp_digit* t1 = t + 6*15;
    sp_digit* t4 = t + 8*15;
    sp_digit* t5 = t + 10*15;

    /* Calculate values to subtract from P->x and P->y. */
    /* U2 = X2*Z1^2 */
    sp_384_mont_sqr_15(t2, p->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t4, t2, p->z, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(t2, t2, q->x, p384_mod, p384_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_384_mont_mul_15(t4, t4, q->y, p384_mod, p384_mp_mod);

    /* Same finite point on both sides: the addition formula does not apply,
     * so double instead. */
    if ((~p->infinity) & (~q->infinity) &
            sp_384_cmp_equal_15(p->x, t2) &
            sp_384_cmp_equal_15(p->y, t4)) {
        sp_384_proj_point_dbl_15(r, p, t);
    }
    else {
        /* Computed ordinates are built in the temporaries and only copied
         * into r at the end, as r may be the same pointer as p. */
        sp_digit* x = t2;
        sp_digit* y = t3;
        sp_digit* z = t6;

        /* H = U2 - X1 */
        sp_384_mont_sub_15(t2, t2, p->x, p384_mod);
        /* R = S2 - Y1 */
        sp_384_mont_sub_15(t4, t4, p->y, p384_mod);
        /* Z3 = H*Z1 */
        sp_384_mont_mul_15(z, p->z, t2, p384_mod, p384_mp_mod);
        /* X3 = R^2 - H^3 - 2*X1*H^2 */
        sp_384_mont_sqr_15(t1, t2, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(t3, p->x, t1, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(t1, t1, t2, p384_mod, p384_mp_mod);
        sp_384_mont_sqr_15(t2, t4, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(t2, t2, t1, p384_mod);
        sp_384_mont_dbl_15(t5, t3, p384_mod);
        sp_384_mont_sub_15(x, t2, t5, p384_mod);
        /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
        sp_384_mont_sub_15(t3, t3, x, p384_mod);
        sp_384_mont_mul_15(t3, t3, t4, p384_mod, p384_mp_mod);
        sp_384_mont_mul_15(t1, t1, p->y, p384_mod, p384_mp_mod);
        sp_384_mont_sub_15(y, t3, t1, p384_mod);
        {
            int i;
            /* All ones when only q is at infinity: result is p. */
            sp_digit maskp = 0 - (q->infinity & (!p->infinity));
            /* All ones when only p is at infinity: result is q. */
            sp_digit maskq = 0 - (p->infinity & (!q->infinity));
            /* All ones otherwise: result is the computed x, y, z. */
            sp_digit maskt = ~(maskp | maskq);
            /* One when both points are at infinity. */
            sp_digit inf = (sp_digit)(p->infinity & q->infinity);

            /* Select with masks rather than branching on the infinity
             * flags. */
            for (i = 0; i < 15; i++) {
                r->x[i] = (p->x[i] & maskp) | (q->x[i] & maskq) |
                          (x[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->y[i] = (p->y[i] & maskp) | (q->y[i] & maskq) |
                          (y[i] & maskt);
            }
            for (i = 0; i < 15; i++) {
                r->z[i] = (p->z[i] & maskp) | (q->z[i] & maskq) |
                          (z[i] & maskt);
            }
            /* Sets the low bit of z when the result is infinity. */
            r->z[0] |= inf;
            r->infinity = (word32)inf;
        }
    }
}

#ifdef FP_ECC
/* Convert the projective point to affine, in place.
 * Ordinates are in Montgomery form.
 *
 * X is multiplied by the square and Y by the cube of the value produced by
 * sp_384_mont_inv_15 from Z; Z is then set to p384_norm_mod.
 *
 * a  Point to convert.
 * t  Temporary data.
 */
static void sp_384_proj_to_affine_15(sp_point_384* a, sp_digit* t)
{
    sp_digit* zinv  = t;
    sp_digit* zinv2 = t + 2 * 15;
    sp_digit* work  = t + 4 * 15;

    /* zinv = inverse of Z */
    sp_384_mont_inv_15(zinv, a->z, work);
    /* zinv2 = zinv^2 */
    sp_384_mont_sqr_15(zinv2, zinv, p384_mod, p384_mp_mod);

    /* X = X * zinv^2 */
    sp_384_mont_mul_15(a->x, a->x, zinv2, p384_mod, p384_mp_mod);

    /* zinv = zinv^3, then Y = Y * zinv^3 */
    sp_384_mont_mul_15(zinv, zinv2, zinv, p384_mod, p384_mp_mod);
    sp_384_mont_mul_15(a->y, a->y, zinv, p384_mod, p384_mp_mod);

    /* Z = one in Montgomery form. */
    XMEMCPY(a->z, p384_norm_mod, sizeof(p384_norm_mod));
}

/* Generate the pre-computed table of points for the base point.
 *
 * width = 8
 * 256 entries
 * 48 bits between
 *
 * Entry 0 is all zeros and entry 1 is the affine form of the point. The
 * power of two entries are produced by repeated calls to
 * sp_384_proj_point_dbl_n_15 with a count of 48; every other entry is the
 * sum of the power of two entry for its top set bit and the entry for its
 * remaining bits. All entries are affine and in Montgomery form.
 *
 * a      The base point.
 * table  Place to store generated point data.
 * tmp    Temporary data.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, the error from
 * sp_384_mod_mul_norm_15 when a conversion fails and MP_OKAY on success.
 */
static int sp_384_gen_stripe_table_15(const sp_point_384* a,
        sp_table_entry_384* table, sp_digit* tmp, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* t = NULL;
#else
    sp_point_384 t[3];
#endif
    sp_point_384* s1 = NULL;
    sp_point_384* s2 = NULL;
    int bit;
    int base;
    int off;
    int err = MP_OKAY;

    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    t = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 3, heap,
                                     DYNAMIC_TYPE_ECC);
    if (t == NULL) {
        err = MEMORY_E;
    }
#endif

    if (err == MP_OKAY) {
        s1 = &t[1];
        s2 = &t[2];

        /* t[0] = a converted with sp_384_mod_mul_norm_15. */
        err = sp_384_mod_mul_norm_15(t[0].x, a->x, p384_mod);
    }
    if (err == MP_OKAY) {
        err = sp_384_mod_mul_norm_15(t[0].y, a->y, p384_mod);
    }
    if (err == MP_OKAY) {
        err = sp_384_mod_mul_norm_15(t[0].z, a->z, p384_mod);
    }
    if (err == MP_OKAY) {
        t[0].infinity = 0;
        sp_384_proj_to_affine_15(&t[0], tmp);

        /* Both addends are affine: z is one and neither is infinity. */
        XMEMCPY(s1->z, p384_norm_mod, sizeof(p384_norm_mod));
        s1->infinity = 0;
        XMEMCPY(s2->z, p384_norm_mod, sizeof(p384_norm_mod));
        s2->infinity = 0;

        /* table[0] = {0, 0, infinity} */
        XMEMSET(&table[0], 0, sizeof(sp_table_entry_384));
        /* table[1] = Affine version of 'a' in Montgomery form */
        XMEMCPY(table[1].x, t[0].x, sizeof(table[1].x));
        XMEMCPY(table[1].y, t[0].y, sizeof(table[1].y));

        /* Entries with a single bit set: 2, 4, ..., 128. */
        for (bit = 1; bit < 8; bit++) {
            base = 1 << bit;

            sp_384_proj_point_dbl_n_15(&t[0], 48, tmp);
            sp_384_proj_to_affine_15(&t[0], tmp);
            XMEMCPY(table[base].x, t[0].x, sizeof(table[base].x));
            XMEMCPY(table[base].y, t[0].y, sizeof(table[base].y));
        }

        /* Remaining entries: table[base + off] = table[base] + table[off]. */
        for (bit = 1; bit < 8; bit++) {
            base = 1 << bit;

            XMEMCPY(s1->x, table[base].x, sizeof(table[base].x));
            XMEMCPY(s1->y, table[base].y, sizeof(table[base].y));
            for (off = 1; off < base; off++) {
                XMEMCPY(s2->x, table[off].x, sizeof(table[off].x));
                XMEMCPY(s2->y, table[off].y, sizeof(table[off].y));
                sp_384_proj_point_add_qz1_15(&t[0], s1, s2, tmp);
                sp_384_proj_to_affine_15(&t[0], tmp);
                XMEMCPY(table[base + off].x, t[0].x,
                        sizeof(table[base + off].x));
                XMEMCPY(table[base + off].y, t[0].y,
                        sizeof(table[base + off].y));
            }
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL) {
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#endif /* FP_ECC */
#ifndef WC_NO_CACHE_RESISTANT
/* Fetch a table entry while reading every candidate entry, so that the
 * entries accessed do not depend on the index.
 *
 * Entries 1..255 are read; an index outside that range (including 0) leaves
 * x and y of r all zero. The z-ordinate and infinity field of r are not
 * modified.
 *
 * r      Point to copy into.
 * table  Table - start of the entries to access
 * idx    Index of entry to retrieve.
 */
static void sp_384_get_entry_256_15(sp_point_384* r,
    const sp_table_entry_384* table, int idx)
{
    int entry;
    int w;
    sp_digit sel;

    /* Clear the result so that the selected entry can be OR-ed in. */
    for (w = 0; w < 15; w++) {
        r->x[w] = 0;
    }
    for (w = 0; w < 15; w++) {
        r->y[w] = 0;
    }

    for (entry = 1; entry < 256; entry++) {
        /* All ones for the wanted entry, zero for every other entry. */
        sel = 0 - (entry == idx);
        for (w = 0; w < 15; w++) {
            r->x[w] |= sel & table[entry].x[w];
        }
        for (w = 0; w < 15; w++) {
            r->y[w] |= sel & table[entry].y[w];
        }
    }
}
#endif /* !WC_NO_CACHE_RESISTANT */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Stripe implementation.
 * Pre-generated: 2^0, 2^48, ...
 * Pre-generated: products of all combinations of above.
 * The scalar is read as 8 stripes of 48 bits. For each bit position, from 47
 * down to 0, that bit of every stripe is gathered into a table index. The
 * entry for position 47 starts the result; each later position doubles the
 * result and adds its entry (with qz=1).
 *
 * r      Resulting point.
 * g      Point the table was generated from. Not used.
 * table  Pre-computed table.
 * k      Scalar to multiply by.
 * map    Indicates whether to convert result to affine.
 * ct     Constant time required.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_384_ecc_mulmod_stripe_15(sp_point_384* r, const sp_point_384* g,
        const sp_table_entry_384* table, const sp_digit* k, int map,
        int ct, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* rt = NULL;
    sp_digit* t = NULL;
#else
    sp_point_384 rt[2];
    sp_digit t[2 * 15 * 6];
#endif
    sp_point_384* p = NULL;
    int col;
    int stripe;
    int idx;
    int bit;
    int err = MP_OKAY;

    (void)g;
    /* Constant time used for cache attack resistance implementation. */
    (void)ct;
    (void)heap;

#ifdef WOLFSSL_SP_SMALL_STACK
    rt = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap,
                                      DYNAMIC_TYPE_ECC);
    if (rt == NULL) {
        err = MEMORY_E;
    }
    else {
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 6, heap,
                               DYNAMIC_TYPE_ECC);
        if (t == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* rt[0] is the running result, rt[1] the table entry to add. */
        p = &rt[1];

        /* Table entries are affine: z is one in Montgomery form. */
        XMEMCPY(p->z, p384_norm_mod, sizeof(p384_norm_mod));
        XMEMCPY(rt[0].z, p384_norm_mod, sizeof(p384_norm_mod));

        /* Top bit of each stripe selects the starting point. */
        idx = 0;
        bit = 47;
        for (stripe = 0; stripe < 8; stripe++) {
            idx |= (int)(((k[bit / 26] >> (bit % 26)) & 1) << stripe);
            bit += 48;
        }
    #ifndef WC_NO_CACHE_RESISTANT
        if (ct) {
            sp_384_get_entry_256_15(&rt[0], table, idx);
        }
        else
    #endif
        {
            XMEMCPY(rt[0].x, table[idx].x, sizeof(table[idx].x));
            XMEMCPY(rt[0].y, table[idx].y, sizeof(table[idx].y));
        }
        /* Index 0 is the point at infinity. */
        rt[0].infinity = !idx;

        for (col = 46; col >= 0; col--) {
            /* Gather bit 'col' of each stripe into the table index. */
            idx = 0;
            bit = col;
            for (stripe = 0; stripe < 8; stripe++) {
                idx |= (int)(((k[bit / 26] >> (bit % 26)) & 1) << stripe);
                bit += 48;
            }

            sp_384_proj_point_dbl_15(&rt[0], &rt[0], t);
        #ifndef WC_NO_CACHE_RESISTANT
            if (ct) {
                sp_384_get_entry_256_15(p, table, idx);
            }
            else
        #endif
            {
                XMEMCPY(p->x, table[idx].x, sizeof(table[idx].x));
                XMEMCPY(p->y, table[idx].y, sizeof(table[idx].y));
            }
            p->infinity = !idx;
            sp_384_proj_point_add_qz1_15(&rt[0], &rt[0], p, t);
        }

        if (map == 0) {
            XMEMCPY(r, &rt[0], sizeof(sp_point_384));
        }
        else {
            sp_384_map_15(r, &rt[0], t);
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (t != NULL) {
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    }
    if (rt != NULL) {
        XFREE(rt, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#ifdef FP_ECC
#ifndef FP_ENTRIES
    #define FP_ENTRIES 16
#endif

/* Cache entry - the point a table belongs to, the table itself and how
 * often the point has been looked up.
 */
typedef struct sp_cache_384_t {
    sp_digit x[15];                 /* X ordinate of the cached point. */
    sp_digit y[15];                 /* Y ordinate of the cached point. */
    sp_table_entry_384 table[256];  /* Precomputation table for point. */
    uint32_t cnt;                   /* Number of look-ups of this point. */
    int set;                        /* Non-zero when the entry is in use. */
} sp_cache_384_t;

/* Cache of tables: FP_ENTRIES entries, each tied to one point. */
static THREAD_LS_T sp_cache_384_t sp_cache_384[FP_ENTRIES];
/* Index of the entry most recently returned by sp_ecc_get_cache_384.
 * -1 until the first look-up. */
static THREAD_LS_T int sp_cache_384_last = -1;
/* Cache has been initialized (all entries marked as not set). */
static THREAD_LS_T int sp_cache_384_inited = 0;

#ifndef HAVE_THREAD_LS
    #ifndef WOLFSSL_MUTEX_INITIALIZER
    /* Set to 1 once sp_cache_384_lock has been initialized at run time. */
    static volatile int initCacheMutex_384 = 0;
    #endif
    /* Lock held by sp_384_ecc_mulmod_15 around the cache look-up and table
     * generation when thread local storage is not available. */
    static wolfSSL_Mutex sp_cache_384_lock WOLFSSL_MUTEX_INITIALIZER_CLAUSE(sp_cache_384_lock);
#endif

/* Get the cache entry for the point.
 *
 * When an entry already holds the point (x and y both match) its use count
 * is incremented and it is returned. Otherwise an unused entry is claimed
 * or, when every entry is in use, the entry with the lowest use count is
 * evicted. A newly claimed entry has its use count set to 1; its table is
 * not touched here.
 *
 * The cache state is updated without locking in this function; the visible
 * caller takes sp_cache_384_lock first when HAVE_THREAD_LS is not defined.
 *
 * g      [in]   Point scalar multiplying.
 * cache  [out]  Cache table to use.
 */
static void sp_ecc_get_cache_384(const sp_point_384* g, sp_cache_384_t** cache)
{
    int i;
    int j;
    uint32_t least;

    if (sp_cache_384_inited == 0) {
        for (i=0; i<FP_ENTRIES; i++) {
            sp_cache_384[i].set = 0;
        }
        sp_cache_384_inited = 1;
    }

    /* Compare point with those in cache. */
    for (i=0; i<FP_ENTRIES; i++) {
        if (!sp_cache_384[i].set)
            continue;

        if (sp_384_cmp_equal_15(g->x, sp_cache_384[i].x) &
                           sp_384_cmp_equal_15(g->y, sp_cache_384[i].y)) {
            sp_cache_384[i].cnt++;
            break;
        }
    }

    /* No match. */
    if (i == FP_ENTRIES) {
        /* Find empty entry - search starts after the last entry used. */
        i = (sp_cache_384_last + 1) % FP_ENTRIES;
        for (; i != sp_cache_384_last; i=(i+1)%FP_ENTRIES) {
            if (!sp_cache_384[i].set) {
                break;
            }
        }

        /* Evict least used. */
        if (i == sp_cache_384_last) {
            /* Start the scan at entry 0 so that the chosen index always
             * matches the count held in 'least'. Leaving i at the last used
             * entry would evict that entry whenever entry 0 had the lowest
             * count. */
            i = 0;
            least = sp_cache_384[0].cnt;
            for (j=1; j<FP_ENTRIES; j++) {
                if (sp_cache_384[j].cnt < least) {
                    i = j;
                    least = sp_cache_384[i].cnt;
                }
            }
        }

        XMEMCPY(sp_cache_384[i].x, g->x, sizeof(sp_cache_384[i].x));
        XMEMCPY(sp_cache_384[i].y, g->y, sizeof(sp_cache_384[i].y));
        sp_cache_384[i].set = 1;
        sp_cache_384[i].cnt = 1;
    }

    *cache = &sp_cache_384[i];
    sp_cache_384_last = i;
}
#endif /* FP_ECC */

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Without FP_ECC the windowed add/sub implementation is always used.
 * With FP_ECC the point is looked up in the cache: on its first use the
 * windowed implementation is used; on its second use a stripe table is
 * generated for it and that, and later uses, go through the stripe
 * implementation. When table generation fails the error is returned and the
 * entry's use count is put back so that the table is not used and generation
 * is tried again on the next use.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, BAD_MUTEX_E when the cache
 * lock cannot be taken and MP_OKAY on success.
 */
static int sp_384_ecc_mulmod_15(sp_point_384* r, const sp_point_384* g,
        const sp_digit* k, int map, int ct, void* heap)
{
#ifndef FP_ECC
    return sp_384_ecc_mulmod_win_add_sub_15(r, g, k, map, ct, heap);
#else
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_digit* tmp;
#else
    sp_digit tmp[2 * 15 * 7];
#endif
    sp_cache_384_t* cache = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 15 * 7, heap, DYNAMIC_TYPE_ECC);
    if (tmp == NULL) {
        err = MEMORY_E;
    }
#endif
#ifndef HAVE_THREAD_LS
    if (err == MP_OKAY) {
        #ifndef WOLFSSL_MUTEX_INITIALIZER
        if (initCacheMutex_384 == 0) {
            wc_InitMutex(&sp_cache_384_lock);
            initCacheMutex_384 = 1;
        }
        #endif
        if (wc_LockMutex(&sp_cache_384_lock) != 0) {
            err = BAD_MUTEX_E;
        }
    }
#endif /* HAVE_THREAD_LS */

    if (err == MP_OKAY) {
        sp_ecc_get_cache_384(g, &cache);
        /* Second use of the point: build its stripe table. */
        if (cache->cnt == 2) {
            err = sp_384_gen_stripe_table_15(g, cache->table, tmp, heap);
            if (err != MP_OKAY) {
                /* The table was not filled in. Put the count back so that
                 * the table is never used as it is and generation is
                 * retried the next time this point is seen. */
                cache->cnt = 1;
            }
        }

#ifndef HAVE_THREAD_LS
        wc_UnLockMutex(&sp_cache_384_lock);
#endif /* HAVE_THREAD_LS */

        if (err == MP_OKAY) {
            if (cache->cnt < 2) {
                err = sp_384_ecc_mulmod_win_add_sub_15(r, g, k, map, ct,
                        heap);
            }
            else {
                err = sp_384_ecc_mulmod_stripe_15(r, g, cache->table, k,
                        map, ct, heap);
            }
        }
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif
    return err;
#endif
}

#endif
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * The multiplication is requested as constant time.
 *
 * km    Scalar to multiply by.
 * gm    Point to multiply.
 * r     Resulting point.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_mulmod_384(const mp_int* km, const ecc_point* gm, ecc_point* r,
        int map, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* pt = NULL;
    sp_digit* scalar = NULL;
#else
    sp_point_384 pt[1];
    sp_digit scalar[15];
#endif
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    pt = (sp_point_384*)XMALLOC(sizeof(sp_point_384), heap,
                                      DYNAMIC_TYPE_ECC);
    if (pt == NULL) {
        err = MEMORY_E;
    }
    else {
        scalar = (sp_digit*)XMALLOC(sizeof(sp_digit) * 15, heap,
                                    DYNAMIC_TYPE_ECC);
        if (scalar == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Convert the inputs to the internal digit representation. */
        sp_384_from_mp(scalar, 15, km);
        sp_384_point_from_ecc_point_15(pt, gm);

        /* Result replaces the input point; ct = 1. */
        err = sp_384_ecc_mulmod_15(pt, pt, scalar, map, 1, heap);
    }
    if (err == MP_OKAY) {
        err = sp_384_point_to_ecc_point_15(pt, r);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (scalar != NULL) {
        XFREE(scalar, heap, DYNAMIC_TYPE_ECC);
    }
    if (pt != NULL) {
        XFREE(pt, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

/* Multiply the point by the scalar, add point a and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * The multiplication is not requested as constant time.
 *
 * km      Scalar to multiply by.
 * gm      Point to multiply.
 * am      Point to add to scalar multiply result.
 * inMont  Point to add is in montgomery form.
 * r       Resulting point.
 * map     Indicates whether to convert result to affine.
 * heap    Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_mulmod_add_384(const mp_int* km, const ecc_point* gm,
    const ecc_point* am, int inMont, ecc_point* r, int map, void* heap)
{
#ifdef WOLFSSL_SP_SMALL_STACK
    sp_point_384* pts = NULL;
    sp_digit* kd = NULL;
#else
    sp_point_384 pts[2];
    sp_digit kd[15 + 15 * 2 * 6];
#endif
    sp_point_384* addend = NULL;
    sp_digit* scratch = NULL;
    int err = MP_OKAY;

#ifdef WOLFSSL_SP_SMALL_STACK
    pts = (sp_point_384*)XMALLOC(sizeof(sp_point_384) * 2, heap,
                                       DYNAMIC_TYPE_ECC);
    if (pts == NULL) {
        err = MEMORY_E;
    }
    else {
        kd = (sp_digit*)XMALLOC(
            sizeof(sp_digit) * (15 + 15 * 2 * 6), heap,
            DYNAMIC_TYPE_ECC);
        if (kd == NULL) {
            err = MEMORY_E;
        }
    }
#endif

    if (err == MP_OKAY) {
        /* Second point holds the addend; scratch follows the scalar. */
        addend = &pts[1];
        scratch = kd + 15;

        sp_384_from_mp(kd, 15, km);
        sp_384_point_from_ecc_point_15(&pts[0], gm);
        sp_384_point_from_ecc_point_15(addend, am);
    }
    if ((err == MP_OKAY) && (!inMont)) {
        /* Addend not in Montgomery form: convert each ordinate. */
        err = sp_384_mod_mul_norm_15(addend->x, addend->x, p384_mod);
        if (err == MP_OKAY) {
            err = sp_384_mod_mul_norm_15(addend->y, addend->y, p384_mod);
        }
        if (err == MP_OKAY) {
            err = sp_384_mod_mul_norm_15(addend->z, addend->z, p384_mod);
        }
    }
    if (err == MP_OKAY) {
        /* Keep the product projective (map = 0) for the addition. */
        err = sp_384_ecc_mulmod_15(&pts[0], &pts[0], kd, 0, 0, heap);
    }
    if (err == MP_OKAY) {
        sp_384_proj_point_add_15(&pts[0], &pts[0], addend, scratch);

        if (map) {
            sp_384_map_15(&pts[0], &pts[0], scratch);
        }

        err = sp_384_point_to_ecc_point_15(&pts[0], r);
    }

#ifdef WOLFSSL_SP_SMALL_STACK
    if (kd != NULL) {
        XFREE(kd, heap, DYNAMIC_TYPE_ECC);
    }
    if (pts != NULL) {
        XFREE(pts, heap, DYNAMIC_TYPE_ECC);
    }
#endif

    return err;
}

#ifdef WOLFSSL_SP_SMALL
/* Multiply the base point of P384 by the scalar and return the result.
 * If map is true then convert result to affine coordinates.
 *
 * Small build: there is no pre-computed table for the base point, so the
 * general point multiplication is called with p384_base.
 *
 * r     Resulting point.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * ct    Constant time required.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_384_ecc_mulmod_base_15(sp_point_384* r, const sp_digit* k,
        int map, int ct, void* heap)
{
    const sp_point_384* base = &p384_base;

    /* No pre-computed values. */
    return sp_384_ecc_mulmod_15(r, base, k, map, ct, heap);
}

#ifdef WOLFSSL_SP_NONBLOCK
/* Non-blocking multiply of the base point of P384 by the scalar.
 * Passes p384_base and all other arguments on to sp_384_ecc_mulmod_15_nb and
 * returns its result.
 */
static int sp_384_ecc_mulmod_base_15_nb(sp_ecc_ctx_t* sp_ctx, sp_point_384* r,
        const sp_digit* k, int map, int ct, void* heap)
{
    int err;

    /* No pre-computed values. */
    err = sp_384_ecc_mulmod_15_nb(sp_ctx, r, &p384_base, k, map, ct, heap);

    return err;
}
#endif /* WOLFSSL_SP_NONBLOCK */


#else
/* Striping precomputation table.
 * 8 points combined into a table of 256 points.
 * Distance of 48 between points.
 */
static const sp_table_entry_384 p384_table[256] = {
    /* 0 */
    { { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00 },
      { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00 } },
    /* 1 */
    { { 0x1c0b528,0x01d5992,0x0e383dd,0x38a835b,0x220e378,0x106d35b,
        0x1c3afc5,0x03bfe1e,0x28459a3,0x2d91521,0x214ede2,0x0bfdc8d,
        0x2151381,0x3708a67,0x004d3aa },
      { 0x303a4fe,0x10f6b52,0x29ac230,0x2fdeed2,0x0a1bfa8,0x3a0ec14,
        0x2de7562,0x3ff662e,0x21968f4,0x031b0d4,0x3969a84,0x2000898,
        0x1c5e9dd,0x2f09685,0x002b78a } },
    /* 2 */
    { { 0x30c535b,0x191d4ca,0x2296298,0x14dc141,0x090dd69,0x05aae6b,
        0x0cd6b42,0x35da80e,0x3b7be12,0x2cf7e6d,0x1f347bd,0x3d365e1,
        0x1448913,0x32704fa,0x00222c5 },
      { 0x280dc64,0x39e5bc9,0x24175f8,0x2dd60d4,0x0120e7c,0x041d02e,
        0x0b5d8ad,0x37b9895,0x2fb5337,0x1f0e2e3,0x14f0224,0x2230b86,
        0x1bc4cf6,0x17cdb09,0x007b5c7 } },
    /* 3 */
    { { 0x2dffea5,0x28f30e7,0x29fce26,0x070df5f,0x235bbfd,0x2f78fbd,
        0x27700d9,0x23d6bc3,0x3471a53,0x0c0e03a,0x05bf9eb,0x276a2ec,
        0x20c3e2e,0x31cc691,0x00dbb93 },
      { 0x126b605,0x2e8983d,0x153737d,0x23bf5e1,0x295d497,0x35ca812,
        0x2d793ae,0x16c6893,0x3777600,0x089a520,0x1e681f8,0x3d55ee6,
        0x154ef99,0x155f592,0x00ae5f9 } },
    /* 4 */
    { { 0x26feef9,0x20315fc,0x1240244,0x250e838,0x3c31a26,0x1cf8af1,
        0x1002c32,0x3b531cd,0x1c53ef1,0x22310ba,0x3f4948e,0x22eafd9,
        0x3863202,0x3d0e2a5,0x006a502 },
      { 0x34536fe,0x04e91ad,0x30ebf5f,0x2af62a7,0x01d218b,0x1c8c9da,
        0x336bcc3,0x23060c3,0x331576e,0x1b14c5e,0x1bbcb76,0x0755e9a,
        0x3d4dcef,0x24c2cf8,0x00917c4 } },
    /* 5 */
    { { 0x349ddd0,0x09b8bb8,0x0250114,0x3e66cbf,0x29f117e,0x3005d29,
        0x36b480e,0x2119bfc,0x2761845,0x253d2f7,0x0580604,0x0bb6db4,
        0x3ca922f,0x1744677,0x008adc7 },
      { 0x3d5a7ce,0x27425ed,0x11e9a61,0x3968d10,0x3874275,0x3692d3b,
        0x03e0470,0x0763d50,0x3d97790,0x3cbaeab,0x2747170,0x18faf3a,
        0x180365e,0x2511fe7,0x0012a36 } },
    /* 6 */
    { { 0x3c52870,0x2701e93,0x296128f,0x120694e,0x1ce0b37,0x3860a36,
        0x10fa180,0x0896b55,0x2f76adb,0x22892ae,0x2e58a34,0x07b4295,
        0x2cb62d1,0x079a522,0x00f3d81 },
      { 0x061ed22,0x2375dd3,0x3c9d861,0x3e602d1,0x10bb747,0x39ae156,
        0x3f796fd,0x087a48a,0x06d680a,0x37f7f47,0x2af2c9d,0x36c55dc,
        0x10f3dc0,0x279b07a,0x00a0937 } },
    /* 7 */
    { { 0x085c629,0x319bbf8,0x089a386,0x184256f,0x15fc2a4,0x00fd2d0,
        0x13d6312,0x363d44d,0x32b7e4b,0x25f2865,0x27df8ce,0x1dce02a,
        0x24ea3b0,0x0e27b9f,0x00d8a90 },
      { 0x3b14461,0x1d371f9,0x0f781bc,0x0503271,0x0dc2cb0,0x13bc284,
        0x34b3a68,0x1ff894a,0x25d2032,0x16f79ba,0x260f961,0x07b10d5,
        0x18173b7,0x2812e2b,0x00eede5 } },
    /* 8 */
    { { 0x13b9a2d,0x132ece2,0x0c5d558,0x02c0214,0x1820c66,0x37cb50f,
        0x26d8267,0x3a00504,0x3f00109,0x33756ee,0x38172f1,0x2e4bb8c,
        0x030d985,0x3e4fcc5,0x00609d4 },
      { 0x2daf9d6,0x16681fa,0x1fb01e0,0x1b03c49,0x370e653,0x183c839,
        0x2207515,0x0ea6b58,0x1ae7aaf,0x3a96522,0x24bae14,0x1c38bd9,
        0x082497b,0x1c05db4,0x000dd03 } },
    /* 9 */
    { { 0x110521f,0x04efa21,0x0c174cc,0x2a7dc93,0x387315b,0x14f7098,
        0x1d83bb3,0x2495ed2,0x2fe0c27,0x1e2d9df,0x093c953,0x0287073,
        0x02c9951,0x336291c,0x0033e30 },
      { 0x208353f,0x3f22748,0x2b2bf0f,0x2373b50,0x10170fa,0x1b8a97d,
        0x0851ed2,0x0b25824,0x055ecb5,0x12049d9,0x3fe1adf,0x11b1385,
        0x28eab06,0x11fac21,0x00513f0 } },
    /* 10 */
    { { 0x35bdf53,0x1847d37,0x1a6dc07,0x29d62c4,0x045d331,0x313b8e5,
        0x165daf1,0x1e34562,0x3e75a58,0x16ea2fa,0x02dd302,0x3302862,
        0x3eb8bae,0x2266a48,0x00cf2a3 },
      { 0x24fd048,0x324a074,0x025df98,0x1662eec,0x3841bfb,0x26ae754,
        0x1df8cec,0x0113ae3,0x0b67fef,0x094e293,0x2323666,0x0ab087c,
        0x2f06509,0x0e142d9,0x00a919d } },
    /* 11 */
    { { 0x1d480d8,0x00ed021,0x3a7d3db,0x1e46ca1,0x28cd9f4,0x2a3ceeb,
        0x24dc754,0x0624a3c,0x0003db4,0x1520bae,0x1c56e0f,0x2fe7ace,
        0x1dc6f38,0x0c826a4,0x008b977 },
      { 0x209cfc2,0x2c16c9c,0x1b70a31,0x21416cb,0x34c49bf,0x186549e,
        0x062498d,0x146e959,0x0391fac,0x08ff944,0x2b4b834,0x013d57a,
        0x2eabffb,0x0370131,0x00c07c1 } },
    /* 12 */
    { { 0x332f048,0x0bf9336,0x16dfad2,0x2451d7b,0x35f23bf,0x299adb2,
        0x0ce0c0a,0x0170294,0x289f034,0x2b7d89e,0x395e2d6,0x1d20df7,
        0x2e64e36,0x16dae90,0x00081c9 },
      { 0x31d6ceb,0x0f80db9,0x0271eba,0x33db1ac,0x1b45bcc,0x1a11c07,
        0x347e630,0x148fd9e,0x142e712,0x3183e3e,0x1cd47ad,0x108d1c9,
        0x09cbb82,0x35e61d9,0x0083027 } },
    /* 13 */
    { { 0x215b0b8,0x0a7a98d,0x2c41b39,0x3f69536,0x0b41441,0x16da8da,
        0x15d556b,0x3c17a26,0x129167e,0x3ea0351,0x2d25a27,0x2f2d285,
        0x15b68f6,0x2931ef5,0x00210d6 },
      { 0x1351130,0x012aec9,0x37ebf38,0x26640f8,0x01d2df6,0x2130972,
        0x201efc0,0x23a457c,0x087a1c6,0x14c68a3,0x163f62a,0x36b494d,
        0x015d481,0x39c35b1,0x005dd6d } },
    /* 14 */
    { { 0x06612ce,0x11c3f61,0x199729f,0x3b36863,0x2986f3e,0x3cd2be1,
        0x04c1612,0x2be2dae,0x00846dd,0x3d7bc29,0x249e795,0x1016803,
        0x37a3714,0x2c5aa8b,0x005f491 },
      { 0x341b38d,0x01eb936,0x3caac7f,0x27863ef,0x1ef7d11,0x1110ec6,
        0x18e0761,0x26498e8,0x01a79a1,0x390d5a1,0x22226fb,0x3d2a473,
        0x0872191,0x1230f32,0x00dc772 } },
    /* 15 */
    { { 0x0b1ec9d,0x03fc6b9,0x3706d57,0x03b9fbb,0x221d23e,0x2867821,
        0x1e40f4c,0x2c9c0f3,0x3c4cd4b,0x31f5948,0x3f13aa6,0x307c1b2,
        0x04b6016,0x116b453,0x005aa72 },
      { 0x0b74de8,0x20519d1,0x134e37f,0x05d882a,0x1839e7a,0x3a2c6a8,
        0x0d14e8d,0x1d78bdd,0x251f30d,0x3a1e27e,0x081c261,0x2c9014b,
        0x165ee09,0x19e0cf1,0x00654e2 } },
    /* 16 */
    { { 0x39fbe67,0x081778b,0x0e44378,0x20dfdca,0x1c4afcb,0x20b803c,
        0x0ec06c6,0x1508f6f,0x1c3114d,0x3bca851,0x3a52463,0x07661d1,
        0x17b0aa0,0x16c5f5c,0x00fc093 },
      { 0x0d01f95,0x0ef13f5,0x2d34965,0x2a25582,0x39aa83e,0x3e38fcf,
        0x3943dca,0x385bbdd,0x210e86f,0x3dc1dd2,0x3f9ffdc,0x18b9bc6,
        0x345c96b,0x0e79621,0x008a72f } },
    /* 17 */
    { { 0x341c342,0x3793688,0x042273a,0x153a9c1,0x3dd326e,0x1d073bc,
        0x2c7d983,0x05524cd,0x00d59e6,0x347abe8,0x3d9a3ef,0x0fb624a,
        0x2c7e4cd,0x09b3171,0x0003faf },
      { 0x045f8ac,0x38bf3cc,0x1e73087,0x0c85d3c,0x314a655,0x382be69,
        0x384f28f,0x24d6cb3,0x2842cdc,0x1777f5e,0x2929c89,0x03c45ed,
        0x3cfcc4c,0x0b59322,0x0035657 } },
    /* 18 */
    { { 0x18c1bba,0x2eb005f,0x33d57ec,0x30e42c3,0x36058f9,0x1865f43,
        0x2116e3f,0x2c4a2bb,0x0684033,0x0f1375c,0x0209b98,0x2136e9b,
        0x1bc4af0,0x0b3e0c7,0x0097c7c },
      { 0x16010e8,0x398777e,0x2a172f4,0x0814a7e,0x0d97e4e,0x274dfc8,
        0x2666606,0x1b5c93b,0x1ed3d36,0x3f3304e,0x13488e0,0x02dbb88,
        0x2d53369,0x3717ce9,0x007cad1 } },
    /* 19 */
    { { 0x257a41f,0x2a6a076,0x39b6660,0x04bb000,0x1e74a04,0x3876b45,
        0x343c6b5,0x0753108,0x3f54668,0x24a13cf,0x23749e8,0x0421fc5,
        0x32f13b5,0x0f31be7,0x00070f2 },
      { 0x1186e14,0x0847697,0x0dff542,0x0dff76c,0x084748f,0x2c7d060,
        0x23aab4d,0x0b43906,0x27ba640,0x1497b59,0x02f5835,0x0a492a4,
        0x0a6892f,0x39f3e91,0x005844e } },
    /* 20 */
    { { 0x33b236f,0x02181cf,0x21dafab,0x0760788,0x019e9d4,0x249ed0a,
        0x36571e3,0x3c7dbcf,0x1337550,0x010d22a,0x285e62f,0x19ee65a,
        0x052bf71,0x1d65fd5,0x0062d43 },
      { 0x2955926,0x3fae7bc,0x0353d85,0x07db7de,0x1440a56,0x328dad6,
        0x1668ec9,0x28058e2,0x1a1a22d,0x1014afc,0x3609325,0x3effdcb,
        0x209f3bd,0x3ca3888,0x0094e50 } },
    /* 21 */
    { { 0x062e8af,0x0b96ccc,0x136990b,0x1d7a28f,0x1a85723,0x0076dec,
        0x21b00b2,0x06a88ff,0x2f0ee65,0x1fa49b7,0x39b10ad,0x10b26fa,
        0x0be7465,0x026e8bf,0x00098e3 },
      { 0x3f1d63f,0x37bacff,0x1374779,0x02882ff,0x323d0e8,0x1da3de5,
        0x12bb3b8,0x0a15a11,0x34d1f95,0x2b3dd6e,0x29ea3fa,0x39ad000,
        0x33a538f,0x390204d,0x0012bd3 } },
    /* 22 */
    { { 0x04cbba5,0x0de0344,0x1d4cc02,0x11fe8d7,0x36207e7,0x32a6da8,
        0x0239281,0x1ec40d7,0x3e89798,0x213fc66,0x0022eee,0x11daefe,
        0x3e74db8,0x28534ee,0x00aa0a4 },
      { 0x07d4543,0x250cc46,0x206620f,0x1c1e7db,0x1321538,0x31fa0b8,
        0x30f74ea,0x01aae0e,0x3a2828f,0x3e9dd22,0x026ef35,0x3c0a62b,
        0x27dbdc5,0x01c23a6,0x000f0c5 } },
    /* 23 */
    { { 0x2f029dd,0x3091337,0x21b80c5,0x21e1419,0x13dabc6,0x3847660,
        0x12b865f,0x36eb666,0x38f6274,0x0ba6006,0x098da24,0x1398c64,
        0x13d08e5,0x246a469,0x009929a },
      { 0x1285887,0x3ff5c8d,0x010237b,0x097c506,0x0bc7594,0x34b9b88,
        0x00cc35f,0x0bb964a,0x00cfbc4,0x29cd718,0x0837619,0x2b4a192,
        0x0c57bb7,0x08c69de,0x00a3627 } },
    /* 24 */
    { { 0x1361ed8,0x266d724,0x366cae7,0x1d5b18c,0x247d71b,0x2c9969a,
        0x0dd5211,0x1edd153,0x25998d7,0x0380856,0x3ab29db,0x09366de,
        0x1e53644,0x2b31ff6,0x008b0ff },
      { 0x3b5d9ef,0x217448d,0x174746d,0x18afea4,0x15b106d,0x3e66e8b,
        0x0479f85,0x13793b4,0x1231d10,0x3c39bce,0x25e8983,0x2a13210,
        0x05a7083,0x382be04,0x00a9507 } },
    /* 25 */
    { { 0x0cf381c,0x1a29b85,0x31ccf6c,0x2f708b8,0x3af9d27,0x2a29732,
        0x168d4da,0x393488d,0x2c0e338,0x3f90c7b,0x0f52ad1,0x2a0a3fa,
        0x2cd80f1,0x15e7a1a,0x00db6a0 },
      { 0x107832a,0x159cb91,0x1289288,0x17e21f9,0x073fc27,0x1584342,
        0x3802780,0x3d6c197,0x154075f,0x16366d1,0x09f712b,0x23a3ec4,
        0x29cf23a,0x3218baf,0x0039f0a } },
    /* 26 */
    { { 0x052edf5,0x2afde13,0x2e53d8f,0x3969626,0x3dcd737,0x1e46ac5,
        0x118bf0d,0x01b2652,0x156bcff,0x16d7ef6,0x1ca46d4,0x34c0cbb,
        0x3e486f6,0x1f85068,0x002cdff },
      { 0x1f47ec8,0x12cee98,0x0608667,0x18fbbe1,0x08a8821,0x31a1fe4,
        0x17c7054,0x3c89e89,0x2edf6cd,0x1b8c32c,0x3f6ea84,0x1319329,
        0x3cd3c2c,0x05f331a,0x00186fa } },
    /* 27 */
    { { 0x1fcb91e,0x0fd4d87,0x358a48a,0x04d91b4,0x083595e,0x044a1e6,
        0x15827b9,0x1d5eaf4,0x2b82187,0x08f3984,0x21bd737,0x0c54285,
        0x2f56887,0x14c2d98,0x00f4684 },
      { 0x01896f6,0x0e542d0,0x2090883,0x269dfcf,0x1e11cb8,0x239fd29,
        0x312cac4,0x19dfacb,0x369f606,0x0cc4f75,0x16579f9,0x33c22cc,
        0x0f22bfd,0x3b251ae,0x006429c } },
    /* 28 */
    { { 0x375f9a4,0x137552e,0x3570498,0x2e4a74e,0x24aef06,0x35b9307,
        0x384ca23,0x3bcd6d7,0x011b083,0x3c93187,0x392ca9f,0x129ce48,
        0x0a800ce,0x145d9cc,0x00865d6 },
      { 0x22b4a2b,0x37f9d9c,0x3e0eca3,0x3e5ec20,0x112c04b,0x2e1ae29,
        0x3ce5b51,0x0f83200,0x32d6a7e,0x10ff1d8,0x081adbe,0x265c30b,
        0x216b1c8,0x0eb4483,0x003cbcd } },
    /* 29 */
    { { 0x030ce93,0x2d331fb,0x20a2fbf,0x1f6dc9c,0x010ed6c,0x1ed5540,
        0x275bf74,0x3df0fb1,0x103333f,0x0241c96,0x1075bfc,0x30e5cf9,
        0x0f31bc7,0x32c01eb,0x00b049e },
      { 0x358839c,0x1dbabd3,0x1e4fb40,0x36a8ac1,0x2101896,0x2d0319b,
        0x2033b0a,0x192e8fd,0x2ebc8d8,0x2867ba7,0x07bf6d2,0x1b3c555,
        0x2477deb,0x198fe09,0x008e5a9 } },
    /* 30 */
    { { 0x3fbd5e1,0x18bf77d,0x2b1d69e,0x151da44,0x338ecfe,0x0768efe,
        0x1a3d56d,0x3c35211,0x10e1c86,0x2012525,0x3bc36ce,0x32b6fe4,
        0x0c8d183,0x15c93f3,0x0041fce },
      { 0x332c144,0x24e70a0,0x246e05f,0x22c21c7,0x2b17f24,0x1ba2bfd,
        0x0534e26,0x318a4f6,0x1dc3b85,0x0c741bc,0x23131b7,0x01a8cba,
        0x364e5db,0x21362cf,0x00f2951 } },
    /* 31 */
    { { 0x2ddc103,0x14ffdcd,0x206fd96,0x0de57bd,0x025f43e,0x381b73a,
        0x2301fcf,0x3bafc27,0x34130b6,0x0216bc8,0x0ff56b2,0x2c4ad4c,
        0x23c6b79,0x1267fa6,0x009b4fb },
      { 0x1d27ac2,0x13e2494,0x1389015,0x38d5b29,0x2d33167,0x3f01969,
        0x28ec1fa,0x1b26de0,0x2587f74,0x1c25668,0x0c44f83,0x23c6f8c,
        0x32fdbb1,0x045f104,0x00a7946 } },
    /* 32 */
    { { 0x23c647b,0x09addd7,0x1348c04,0x0e633c1,0x1bfcbd9,0x1cb034f,
        0x1312e31,0x11cdcc7,0x1e6ee75,0x057d27f,0x2da7ee6,0x154c3c1,
        0x3a5fb89,0x2c2ba2c,0x00cf281 },
      { 0x1b8a543,0x125cd50,0x1d30fd1,0x29cc203,0x341a625,0x14e4233,
        0x3aae076,0x289e38a,0x036ba02,0x230f405,0x3b21b8f,0x34088b9,
        0x01297a0,0x03a75fb,0x00fdc27 } },
    /* 33 */
    { { 0x07f41d6,0x1cf032f,0x1641008,0x0f86deb,0x3d97611,0x0e110fe,
        0x136ff42,0x0b914a9,0x0e241e6,0x180c340,0x1f545fc,0x0ba619d,
        0x1208c53,0x04223a4,0x00cd033 },
      { 0x397612c,0x0132665,0x34e2d1a,0x00bba99,0x1d4393e,0x065d0a8,
        0x2fa69ee,0x1643b55,0x08085f0,0x3774aad,0x08a2243,0x33bf149,
        0x03f41a5,0x1ed950e,0x0048cc6 } },
    /* 34 */
    { { 0x014ab48,0x010c3bf,0x2a744e5,0x13c99c1,0x2195b7f,0x32207fd,
        0x28a228c,0x004f4bf,0x0e2d945,0x2ec6e5a,0x0b92162,0x1aa95e5,
        0x2754a93,0x1adcd93,0x004fb76 },
      { 0x1e1ff7f,0x24ef28c,0x269113f,0x32b393c,0x2696eb5,0x0ac2780,
        0x354bf8a,0x0ffe3fd,0x09ce58e,0x0163c4f,0x1678c0b,0x15cd1bc,
        0x292b3b7,0x036ea19,0x00d5420 } },
    /* 35 */
    { { 0x1da1265,0x0c2ef5b,0x18dd9a0,0x3f3a25c,0x0f7b4f3,0x0d8196e,
        0x24931f9,0x090729a,0x1875f72,0x1ef39cb,0x2577585,0x2ed472d,
        0x136756c,0x20553a6,0x00c7161 },
      { 0x2e32189,0x283de4b,0x00b2e81,0x0989df7,0x3ef2fab,0x1c7d1a7,
        0x24f6feb,0x3e16679,0x233dfda,0x06d1233,0x3e6b5df,0x1707132,
        0x05f7b3f,0x2c00779,0x00fb8df } },
    /* 36 */
    { { 0x15bb921,0x117e9d3,0x267ec73,0x2f934ad,0x25c7e04,0x20b5e8f,
        0x2d3a802,0x2ca911f,0x3f87e47,0x39709dd,0x08488e2,0x2cec400,
        0x35b4589,0x1f0acba,0x009aad7 },
      { 0x2ac34ae,0x06f29f6,0x3326d68,0x3949abe,0x02452e4,0x0687b85,
        0x0879244,0x1eb7832,0x0d4c240,0x31d0ec1,0x3c17a2a,0x17a666f,
        0x01a06cb,0x3e0929c,0x004dca2 } },
    /* 37 */
    { { 0x127bc1a,0x0c72984,0x13be68e,0x26c5fab,0x1a3edd5,0x097d685,
        0x36b645e,0x385799e,0x394a420,0x39d8885,0x0b1e872,0x13f60ed,
        0x2ce1b79,0x3c0ecb7,0x007cab3 },
      { 0x29b3586,0x26fc572,0x0bd7711,0x0913494,0x0a55459,0x31af3c9,
        0x3633eac,0x3e2105c,0x0c2b1b6,0x0e6f4c2,0x047d38c,0x2b81bd5,
        0x1fe1c3b,0x04d7cd0,0x0054dcc } },
    /* 38 */
    { { 0x03caf0d,0x0d66365,0x313356d,0x2a4897f,0x2ce044e,0x18feb7a,
        0x1f6a7c5,0x3709e7b,0x14473e8,0x2d8cbae,0x3190dca,0x12d19f8,
        0x31e3181,0x3cc5b6e,0x002d4f4 },
      { 0x143b7ca,0x2604728,0x39508d6,0x0cb79f3,0x24ec1ac,0x1ed7fa0,
        0x3ab5fd3,0x3c76488,0x2e49390,0x03a0985,0x3580461,0x3fd2c81,
        0x308f0ab,0x38561d6,0x0011b9b } },
    /* 39 */
    { { 0x3be682c,0x0c68f4e,0x32dd4ae,0x099d3bb,0x0bc7c5d,0x311f750,
        0x2fd10a3,0x2e7864a,0x23bc14a,0x13b1f82,0x32e495e,0x1b0f746,
        0x3cd856a,0x17a4c26,0x00085ee },
      { 0x02e67fd,0x06a4223,0x2af2f38,0x2038987,0x132083a,0x1b7bb85,
        0x0d6a499,0x131e43f,0x3035e52,0x278ee3e,0x1d5b08b,0x30d8364,
        0x2719f8d,0x0b21fc9,0x003a06e } },
    /* 40 */
    { { 0x237cac0,0x27d6a1c,0x27945cd,0x2750d61,0x293f0b5,0x253db13,
        0x04a764e,0x20b4d0e,0x12bb627,0x160c13b,0x0de0601,0x236e2cf,
        0x2190f0b,0x354d76f,0x004336d },
      { 0x2ab473a,0x10d54e4,0x1046574,0x1d6f97b,0x0031c72,0x06426a9,
        0x38678c2,0x0b76cf9,0x04f9920,0x152adf8,0x2977e63,0x1234819,
        0x198be26,0x061024c,0x00d427d } },
    /* 41 */
    { { 0x39b5a31,0x2123d43,0x362a822,0x1a2eab6,0x0bb0034,0x0d5d567,
        0x3a04723,0x3a10c8c,0x08079ae,0x0d27bda,0x2eb9e1e,0x2619e82,
        0x39a55a8,0x0c6c7db,0x00c1519 },
      { 0x174251e,0x13ac2eb,0x295ed26,0x18d2afc,0x037b9b2,0x1258344,
        0x00921b0,0x1f702d8,0x1bc4da7,0x1c3794f,0x12b1869,0x366eacf,
        0x16ddf01,0x31ebdc5,0x00ad54e } },
    /* 42 */
    { { 0x1efdc58,0x1370d5e,0x0ddb8e7,0x1a53fda,0x1456bd3,0x0c825a9,
        0x0e74ccd,0x20f41c9,0x3423867,0x139073f,0x3c70d8a,0x131fc85,
        0x219a2a0,0x34bf986,0x0041199 },
      { 0x1c05dd2,0x268f80a,0x3da9d38,0x1af9f8f,0x0535f2a,0x30ad37e,
        0x2cf72d7,0x14a509b,0x1f4fe74,0x259e09d,0x1d23f51,0x0672732,
        0x08fc463,0x00b6201,0x001e05a } },
    /* 43 */
    { { 0x0d5ffe8,0x3238bb5,0x17f275c,0x25b6fa8,0x2f8bb48,0x3b8f2d2,
        0x059790c,0x18594d4,0x285a47c,0x3d301bb,0x12935d2,0x23ffc96,
        0x3d7c7f9,0x15c8cbf,0x0034c4a },
      { 0x20376a2,0x05201ba,0x1e02c4b,0x1413c45,0x02ea5e7,0x39575f0,
        0x2d76e21,0x113694c,0x011f310,0x0da3725,0x31b7799,0x1cb9195,
        0x0cfd592,0x22ee4ea,0x00adaa3 } },
    /* 44 */
    { { 0x14ed72a,0x031c49f,0x39a34bf,0x192e87d,0x0da0e92,0x130e7a9,
        0x00258bf,0x144e123,0x2d82a71,0x0294e53,0x3f06c66,0x3d4473a,
        0x037cd4a,0x3bbfb17,0x00fcebc },
      { 0x39ae8c1,0x2dd6a9d,0x206ef23,0x332b479,0x2deff59,0x09d5720,
        0x3526fd2,0x33bf7cf,0x344bb32,0x359316a,0x115bdef,0x1b8468a,
        0x3813ea9,0x11a8450,0x00ab197 } },
    /* 45 */
    { { 0x0837d7d,0x1e1617b,0x0ba443c,0x2f2e3b8,0x2ca5b6f,0x176ed7b,
        0x2924d9d,0x07294d3,0x104bb4f,0x1cfd3e8,0x398640f,0x1162dc8,
        0x007ea15,0x2aa75fd,0x004231f },
      { 0x16e6896,0x01987be,0x0f9d53e,0x1a740ec,0x1554e4c,0x31e1634,
        0x3cb07b9,0x013eb53,0x39352cb,0x1dfa549,0x0974e7f,0x17c55d2,
        0x157c85f,0x1561adb,0x002e3fa } },
    /* 46 */
    { { 0x29951a8,0x35200da,0x2ad042c,0x22109e4,0x3a8b15b,0x2eca69c,
        0x28bcf9a,0x0cfa063,0x0924099,0x12ff668,0x2fb88dc,0x028d653,
        0x2445876,0x218d01c,0x0014418 },
      { 0x1caedc7,0x295bba6,0x01c9162,0x3364744,0x28fb12e,0x24c80b6,
        0x2719673,0x35e5ba9,0x04aa4cc,0x206ab23,0x1cf185a,0x2c140d8,
        0x1095a7d,0x1b3633f,0x000c9f8 } },
    /* 47 */
    { { 0x0b2a556,0x0a051c4,0x30b29a7,0x190c9ed,0x3767ca9,0x38de66d,
        0x2d9e125,0x3aca813,0x2dc22a3,0x319e074,0x0d9450a,0x3445bac,
        0x3e08a5b,0x07f29fa,0x00eccac },
      { 0x02d6e94,0x21113f7,0x321bde6,0x0a4d7b3,0x03621f4,0x2780e8b,
        0x22d5432,0x1fc2853,0x0d57d3e,0x254f90b,0x33ed00b,0x289b025,
        0x12272bb,0x30e715f,0x0000297 } },
    /* 48 */
    { { 0x0243a7d,0x2aac42e,0x0c5b3aa,0x0fa3e96,0x06eeef9,0x2b9fdd9,
        0x26fca39,0x0134fe1,0x22661ab,0x1990416,0x03945d6,0x15e3628,
        0x3848ca3,0x0f91e46,0x00b08cd },
      { 0x16d2411,0x3717e1d,0x128c45e,0x3669d54,0x0d4a790,0x2797da8,
        0x0f09634,0x2faab0b,0x27df649,0x3b19b49,0x0467039,0x39b65a2,
        0x3816f3c,0x31ad0bd,0x0050046 } },
    /* 49 */
    { { 0x2425043,0x3858099,0x389092a,0x3f7c236,0x11ff66a,0x3c58b39,
        0x2f5a7f8,0x1663ce1,0x2a0fcf5,0x38634b7,0x1a8ca18,0x0dcace8,
        0x0e6f778,0x03ae334,0x00df0d2 },
      { 0x1bb4045,0x357875d,0x14b77ed,0x33ae5b6,0x2252a47,0x31899dd,
        0x3293582,0x040c6f6,0x14340dd,0x3614f0e,0x3d5f47f,0x326fb3d,
        0x0044a9d,0x00beeb9,0x0027c23 } },
    /* 50 */
    { { 0x32d49ce,0x34822a3,0x30a22d1,0x00858b7,0x10d91aa,0x2681fd9,
        0x1cce870,0x2404a71,0x38b8433,0x377c1c8,0x019442c,0x0a38b21,
        0x22aba50,0x0d61c81,0x002dcbd },
      { 0x0680967,0x2f0f2f9,0x172cb5f,0x1167e4b,0x12a7bc6,0x05b0da7,
        0x2c76e11,0x3a36201,0x37a3177,0x1d71419,0x0569df5,0x0dce7ad,
        0x3f40b75,0x3bd8db0,0x002d481 } },
    /* 51 */
    { { 0x2a1103e,0x34e7f7f,0x1b171a2,0x24a57e0,0x2eaae55,0x166c992,
        0x10aa18f,0x0bb836f,0x01acb59,0x0e430e7,0x1750cca,0x18be036,
        0x3cc6cdf,0x0a0f7e5,0x00da4d8 },
      { 0x2201067,0x374d187,0x1f6b0a6,0x165a7ec,0x31531f8,0x3580487,
        0x15e5521,0x0724522,0x2b04c04,0x202c86a,0x3cc1ccf,0x225b11a,
        0x1bde79d,0x0eccc50,0x00d24da } },
    /* 52 */
    { { 0x3b0a354,0x2814dd4,0x1cd8575,0x3d031b7,0x0392ff2,0x1855ee5,
        0x0e8cff5,0x203442e,0x3bd3b1b,0x141cf95,0x3fedee1,0x1d783c0,
        0x26f192a,0x0392aa3,0x0075238 },
      { 0x158ffe9,0x3889f19,0x14151f4,0x06067b1,0x13a3486,0x1e65c21,
        0x382d5ef,0x1ab0aac,0x2ffddc4,0x3179b7a,0x3c8d094,0x05101e3,
        0x237c6e5,0x3947d83,0x00f674f } },
    /* 53 */
    { { 0x363408f,0x21eb96b,0x27376fb,0x2a735d6,0x1a39c36,0x3d31863,
        0x33313fc,0x32235e0,0x082f034,0x23ef351,0x39b3528,0x1a69d84,
        0x1d9c944,0x07159ad,0x0077a71 },
      { 0x04f8d65,0x25771e5,0x2ba84a6,0x194586a,0x1e6da5f,0x118059a,
        0x14e9c32,0x1d24619,0x3f528ae,0x22f22e4,0x0f5580d,0x0747a0e,
        0x32cc85f,0x286b3a8,0x008ccf9 } },
    /* 54 */
    { { 0x196fee2,0x2c4431c,0x094528a,0x18e1d32,0x175799d,0x26bb6b7,
        0x2293482,0x23fd289,0x07b2be8,0x1a5c533,0x158d60d,0x04a4f3f,
        0x164e9f7,0x32ccca9,0x00da6b6 },
      { 0x1d821c2,0x3f76c4f,0x323df43,0x17e4374,0x0f2f278,0x121227e,
        0x2464190,0x19d2644,0x326d24c,0x3185983,0x0803c15,0x0767a33,
        0x1c4c996,0x0563eab,0x00631c6 } },
    /* 55 */
    { { 0x1752366,0x0baf83f,0x288bacf,0x0384e6f,0x2b93c34,0x3c805e7,
        0x3664850,0x29e1663,0x254ff1d,0x3852080,0x0f85c16,0x1e389d9,
        0x3191352,0x3915eaa,0x00a246e },
      { 0x3763b33,0x187ad14,0x3c0d438,0x3f11702,0x1c49f03,0x35ac7a8,
        0x3f16bca,0x27266bf,0x08b6fd4,0x0f38ce4,0x37fde8c,0x147a6ff,
        0x02c5e5c,0x28e7fc5,0x00076a7 } },
    /* 56 */
    { { 0x2338d10,0x0e77fa7,0x011b046,0x1bfd0ad,0x28ee699,0x21d73bc,
        0x0461d1a,0x342ea58,0x2d695b4,0x30415ed,0x2906e0b,0x18e494a,
        0x20f8a27,0x026b870,0x002c19f },
      { 0x2f4c43d,0x3f0fc3b,0x0aa95b8,0x2a01ea1,0x3e2e1b1,0x0d74af6,
        0x0555288,0x0cb757d,0x24d2071,0x143d2bb,0x3907f67,0x3e0ce30,
        0x131f0e9,0x3724381,0x007a874 } },
    /* 57 */
    { { 0x3c27050,0x08b5165,0x0bf884b,0x3dd679c,0x3bd0b8d,0x25ce2e6,
        0x1674057,0x1f13ed3,0x1f5cd91,0x0d1fd35,0x13ce6e3,0x2671338,
        0x10f8b90,0x34e5487,0x00942bf },
      { 0x03b566d,0x23c3da9,0x37de502,0x1a486ff,0x1af6e86,0x1108cb3,
        0x36f856c,0x01a6a0f,0x179f915,0x1595a01,0x2cfecb8,0x082568b,
        0x1ba16d1,0x1abb6c0,0x00cf7f0 } },
    /* 58 */
    { { 0x2f96c80,0x1b8f123,0x209c0f5,0x2ccf76d,0x1d521f2,0x3705143,
        0x2941027,0x07f88af,0x07102a9,0x38b4868,0x1efa37d,0x1bdd3e8,
        0x028a12e,0x02e055b,0x009a9a9 },
      { 0x1c7dfcb,0x3aa7aa7,0x1d62c54,0x3f0b0b0,0x3c74e66,0x274f819,
        0x23f9674,0x0e2b67c,0x24654dd,0x0c71f0e,0x1946cee,0x0016211,
        0x0045dc7,0x0da1173,0x0089856 } },
    /* 59 */
    { { 0x0e73946,0x29f353f,0x056329d,0x2d48c5a,0x28f697d,0x2ea4bb1,
        0x235e9cc,0x34faa38,0x15f9f91,0x3557519,0x2a50a6c,0x1a27c8e,
        0x2a1a0f3,0x3098879,0x00dcf21 },
      { 0x1b818bf,0x2f20b98,0x2243cff,0x25b691e,0x3c74a2f,0x2f06833,
        0x0e980a8,0x32db48d,0x2b57929,0x33cd7f5,0x2fe17d6,0x11a384b,
        0x2dafb81,0x2b9562c,0x00ddea6 } },
    /* 60 */
    { { 0x2787b2e,0x37a21df,0x310d294,0x07ce6a4,0x1258acc,0x3050997,
        0x19714aa,0x122824b,0x11c708b,0x0462d56,0x21abbf7,0x331aec3,
        0x307b927,0x3e8d5a0,0x00c0581 },
      { 0x24d4d58,0x3d628fc,0x23279e0,0x2e38338,0x2febe9b,0x346f9c0,
        0x3d6a419,0x3264e47,0x245faca,0x3669f62,0x1e50d66,0x3028232,
        0x18201ab,0x0bdc192,0x0002c34 } },
    /* 61 */
    { { 0x17bdbc2,0x1c501c5,0x1605ccd,0x31ab438,0x372fa89,0x24a8057,
        0x13da2bb,0x3f95ac7,0x3cda0a3,0x1e2b679,0x24f0673,0x03b72f4,
        0x35be616,0x2ccd849,0x0079d4d },
      { 0x33497c4,0x0c7f657,0x2fb0d3d,0x3b81064,0x38cafea,0x0e942bc,
        0x3ca7451,0x2ab9784,0x1678c85,0x3c62098,0x1eb556f,0x01b3aa2,
        0x149f3ce,0x2656f6d,0x002eef1 } },
    /* 62 */
    { { 0x0596edc,0x1f4fad4,0x03a28ed,0x18a4149,0x3aa3593,0x12db40a,
        0x12c2c2a,0x3b1a288,0x327c4fb,0x35847f5,0x384f733,0x02e3fde,
        0x1af0e8a,0x2e417c3,0x00d85a6 },
      { 0x0091cf7,0x2267d75,0x276860e,0x19cbbfc,0x04fef2b,0x030ce59,
        0x3195cb1,0x1aa3f07,0x3699362,0x2a09d74,0x0d6c840,0x1e413d0,
        0x28acdc7,0x1ff5ea1,0x0088d8b } },
    /* 63 */
    { { 0x3d98425,0x08dc8de,0x154e85f,0x24b1c2c,0x2d44639,0x19a1e8b,
        0x300ee29,0x053f72e,0x3f7c832,0x12417f6,0x1359368,0x0674a4c,
        0x1218e20,0x0e4fbd4,0x000428c },
      { 0x01e909a,0x1d88fe6,0x12da40c,0x215ef86,0x2925133,0x004241f,
        0x3e480f4,0x2d16523,0x07c3120,0x3375e86,0x21fd8f3,0x35dc0b6,
        0x0efc5c9,0x14ef8d6,0x0066e47 } },
    /* 64 */
    { { 0x2973cf4,0x34d3845,0x34f7070,0x22df93c,0x120aee0,0x3ae2b4a,
        0x1af9b95,0x177689a,0x036a6a4,0x0377828,0x23df41e,0x22d4a39,
        0x0df2aa1,0x06ca898,0x0003cc7 },
      { 0x06b1dd7,0x19dc2a8,0x35d324a,0x0467499,0x25bfa9c,0x1a1110c,
        0x01e2a19,0x1b3c1cf,0x18d131a,0x10d9815,0x2ee7945,0x0a2720c,
        0x0ddcdb0,0x2c071b6,0x00a6aef } },
    /* 65 */
    { { 0x1ab5245,0x1192d00,0x13ffba1,0x1b71236,0x09b8d0b,0x0eb49cb,
        0x1867dc9,0x371de4e,0x05eae9f,0x36faf82,0x094ea8b,0x2b9440e,
        0x022e173,0x2268e6b,0x00740fc },
      { 0x0e23b23,0x22c28ca,0x04d05e2,0x0bb84c4,0x1235272,0x0289903,
        0x267a18b,0x0df0fd1,0x32e49bb,0x2ab1d29,0x281e183,0x3dcd3c3,
        0x1c0eb79,0x2db0ff6,0x00bffe5 } },
    /* 66 */
    { { 0x2a2123f,0x0d63d71,0x1f6db1a,0x257f8a3,0x1927b2d,0x06674be,
        0x302753f,0x20b7225,0x14c1a3f,0x0429cdd,0x377affe,0x0f40a75,
        0x2d34d06,0x05fb6b9,0x0054398 },
      { 0x38b83c4,0x1e7bbda,0x1682f79,0x0527651,0x2615cb2,0x1795fab,
        0x0e4facc,0x11f763c,0x1b81130,0x2010ae2,0x13f3650,0x20d5b72,
        0x1f32f88,0x34617f4,0x00bf008 } },
    /* 67 */
    { { 0x28068db,0x0aa8913,0x1a47801,0x10695ca,0x1c72cc6,0x0fc1a47,
        0x33df2c4,0x0517cf0,0x3471d92,0x1be815c,0x397f794,0x3f03cbe,
        0x121bfae,0x172cbe0,0x00813d7 },
      { 0x383bba6,0x04f1c90,0x0b3f056,0x1c29089,0x2a924ce,0x3c85e69,
        0x1cecbe5,0x0ad8796,0x0aa79f6,0x25e38ba,0x13ad807,0x30b30ed,
        0x0fa963a,0x35c763d,0x0055518 } },
    /* 68 */
    { { 0x0623f3b,0x3ca4880,0x2bff03c,0x0457ca7,0x3095c71,0x02a9a08,
        0x1722478,0x302c10b,0x3a17458,0x001131e,0x0959ec2,0x18bdfbc,
        0x2929fca,0x2adfe32,0x0040ae2 },
      { 0x127b102,0x14ddeaa,0x1771b8c,0x283700c,0x2398a86,0x085a901,
        0x108f9dc,0x0cc0012,0x33a918d,0x26d08e9,0x20b9473,0x12c3fc7,
        0x1f69763,0x1c94b5a,0x00e29de } },
    /* 69 */
    { { 0x035af04,0x3450021,0x12da744,0x077fb06,0x25f255b,0x0db7150,
        0x17dc123,0x1a2a07c,0x2a7636a,0x3972430,0x3704ca1,0x0327add,
        0x3d65a96,0x3c79bec,0x009de8c },
      { 0x11d3d06,0x3fb8354,0x12c7c60,0x04fe7ad,0x0466e23,0x01ac245,
        0x3c0f5f2,0x2a935d0,0x3ac2191,0x090bd56,0x3febdbc,0x3f1f23f,
        0x0ed1cce,0x02079ba,0x00d4fa6 } },
    /* 70 */
    { { 0x0ab9645,0x10174ec,0x3711b5e,0x26357c7,0x2aeec7f,0x2170a9b,
        0x1423115,0x1a5122b,0x39e512c,0x18116b2,0x290db1c,0x041b13a,
        0x26563ae,0x0f56263,0x00b89f3 },
      { 0x3ed2ce4,0x01f365f,0x1b2043b,0x05f7605,0x1f9934e,0x2a068d2,
        0x38d4d50,0x201859d,0x2de5291,0x0a7985a,0x17e6711,0x01b6c1b,
        0x08091fa,0x33c6212,0x001da23 } },
    /* 71 */
    { { 0x2f2c4b5,0x311acd0,0x1e47821,0x3bd9816,0x1931513,0x1bd4334,
        0x30ae436,0x2c49dc0,0x2c943e7,0x010ed4d,0x1fca536,0x189633d,
        0x17abf00,0x39e5ad5,0x00e4e3e },
      { 0x0c8b22f,0x2ce4009,0x1054bb6,0x307f2fc,0x32eb5e2,0x19d24ab,
        0x3b18c95,0x0e55e4d,0x2e4acf5,0x1bc250c,0x1dbf3a5,0x17d6a74,
        0x087cf58,0x07f6f82,0x00f8675 } },
    /* 72 */
    { { 0x110e0b2,0x0e672e7,0x11b7157,0x1598371,0x01c0d59,0x3d60c24,
        0x096b8a1,0x0121075,0x0268859,0x219962f,0x03213f2,0x3022adc,
        0x18de488,0x3dcdeb9,0x008d2e0 },
      { 0x06cfee6,0x26f2552,0x3c579b7,0x31fa796,0x2036a26,0x362ba5e,
        0x103601c,0x012506b,0x387ff3a,0x101a41f,0x2c7eb58,0x23d2efc,
        0x10a5a07,0x2fd5fa3,0x00e3731 } },
    /* 73 */
    { { 0x1cd0abe,0x08a0af8,0x2fa272f,0x17a1fbf,0x1d4f901,0x30e0d2f,
        0x1898066,0x273b674,0x0c1b8a2,0x3272337,0x3ee82eb,0x006e7d3,
        0x2a75606,0x0af1c81,0x0037105 },
      { 0x2f32562,0x2842491,0x1bb476f,0x1305cd4,0x1daad53,0x0d8daed,
        0x164c37b,0x138030f,0x05145d5,0x300e2a3,0x32c09e7,0x0798600,
        0x3515130,0x2b9e55c,0x009764e } },
    /* 74 */
    { { 0x3d5256a,0x06c67f2,0x3a3b879,0x3c9b284,0x04007e0,0x33c1a41,
        0x3794604,0x1d6240e,0x022b6c1,0x22c62a7,0x01d4590,0x32df5f6,
        0x368f1a1,0x2a7486e,0x006e13f },
      { 0x31e6e16,0x20f18a9,0x09ed471,0x23b861d,0x15cf0ef,0x397b502,
        0x1c7f9b2,0x05f84b2,0x2cce6e1,0x3c10bba,0x13fb5a7,0x1b52058,
        0x1feb1b8,0x03b7279,0x00ea1cf } },
    /* 75 */
    { { 0x2a4cc9b,0x15cf273,0x08f36e6,0x076bf3b,0x2541796,0x10e2dbd,
        0x0bf02aa,0x3aa2201,0x03cdcd4,0x3ee252c,0x3799571,0x3e01fa4,
        0x156e8d0,0x1fd6188,0x003466a },
      { 0x2515664,0x166b355,0x2b0b51e,0x0f28f17,0x355b0f9,0x2909e76,
        0x206b026,0x3823a12,0x179c5fa,0x0972141,0x2663a1a,0x01ee36e,
        0x3fc8dcf,0x2ef3d1b,0x0049a36 } },
    /* 76 */
    { { 0x2d93106,0x3d6b311,0x3c9ce47,0x382aa25,0x265b7ad,0x0b5f92f,
        0x0f4c941,0x32aa4df,0x380d4b2,0x0e8aba6,0x260357a,0x1f38273,
        0x0d5f95e,0x199f23b,0x0029f77 },
      { 0x0a0b1c5,0x21a3d6a,0x0ad8df6,0x33d8a5e,0x1240858,0x30000a8,
        0x3ac101d,0x2a8143d,0x1d7ffe9,0x1c74a2a,0x1b962c9,0x1261359,
        0x0c8b274,0x002cf4a,0x00a8a7c } },
    /* 77 */
    { { 0x211a338,0x22a14ab,0x16e77c5,0x3c746be,0x3a78613,0x0d5731c,
        0x1767d25,0x0b799fa,0x009792a,0x09ae8dc,0x124386b,0x183d860,
        0x176747d,0x14c4445,0x00ab09b },
      { 0x0eb9dd0,0x0121066,0x032895a,0x330541c,0x1e6c17a,0x2271b92,
        0x06da454,0x054c2bf,0x20abb21,0x0ead169,0x3d7ea93,0x2359649,
        0x242c6c5,0x3194255,0x00a3ef3 } },
    /* 78 */
    { { 0x3010879,0x1083a77,0x217989d,0x174e55d,0x29d2525,0x0e544ed,
        0x1efd50e,0x30c4e73,0x05bd5d1,0x0793bf9,0x3f7af77,0x052779c,
        0x2b06bc0,0x13d0d02,0x0055a6b },
      { 0x3eaf771,0x094947a,0x0288f13,0x0a21e35,0x22ab441,0x23816bf,
        0x15832e1,0x2d8aff3,0x348cc1f,0x2bbd4a8,0x01c4792,0x34209d3,
        0x06dc72b,0x211a1df,0x00345c5 } },
    /* 79 */
    { { 0x2a65e90,0x173ac2f,0x199cde1,0x0ac905b,0x00987f7,0x3618f7b,
        0x1b578df,0x0d5e113,0x34bac6a,0x27d85ed,0x1b48e99,0x18af5eb,
        0x1a1be9e,0x3987aac,0x00877ca },
      { 0x2358610,0x3776a8e,0x2b0723a,0x344c978,0x22fc4d6,0x1615d53,
        0x3198f51,0x2d61225,0x12cb392,0x07dd061,0x355f7de,0x09e0132,
        0x0efae99,0x13b46aa,0x00e9e6c } },
    /* 80 */
    { { 0x0683186,0x36d8e66,0x0ea9867,0x0937731,0x1fb5cf4,0x13c39ef,
        0x1a7ffed,0x27dfb32,0x31c7a77,0x09f15fd,0x16b25ef,0x1dd01e7,
        0x0168090,0x240ed02,0x0090eae },
      { 0x2e1fceb,0x2ab9783,0x1a1fdf2,0x093a1b0,0x33ff1da,0x2864fb7,
        0x3587d6c,0x275aa03,0x123dc9b,0x0e95a55,0x0592030,0x2102402,
        0x1bdef7b,0x37f2e9b,0x001efa4 } },
    /* 81 */
    { { 0x0540015,0x20e3e78,0x37dcfbd,0x11b0e41,0x02c3239,0x3586449,
        0x1fb9e6a,0x0baa22c,0x00c0ca6,0x3e58491,0x2dbe00f,0x366d4b0,
        0x176439a,0x2a86b86,0x00f52ab },
      { 0x0ac32ad,0x226250b,0x0f91d0e,0x1098aa6,0x3dfb79e,0x1dbd572,
        0x052ecf2,0x0f84995,0x0d27ad2,0x036c6b0,0x1e4986f,0x2317dab,
        0x2327df6,0x0dee0b3,0x00389ac } },
    /* 82 */
    { { 0x0e60f5b,0x0622d3e,0x2ada511,0x05522a8,0x27fe670,0x206af28,
        0x333cb83,0x3f25f6c,0x19ddaf3,0x0ec579b,0x36aabc0,0x093dbac,
        0x348b44b,0x277dca9,0x00c5978 },
      { 0x1cf5279,0x32e294a,0x1a6c26f,0x3f006b6,0x37a3c6b,0x2e2eb26,
        0x2cf88d4,0x3410619,0x1899c80,0x23d3226,0x30add14,0x2810905,
        0x01a41f0,0x11e5176,0x005a02f } },
    /* 83 */
    { { 0x1c90202,0x321df30,0x3570fa5,0x103e2b1,0x3d099d4,0x05e207d,
        0x0a5b1bd,0x0075d0a,0x3db5b25,0x2d87899,0x32e4465,0x226fc13,
        0x24cb8f8,0x3821daa,0x004da3a },
      { 0x3e66861,0x03f89b8,0x386d3ef,0x14ccc62,0x35e7729,0x11ce5b7,
        0x035fbc7,0x3f4df0f,0x29c439f,0x1144568,0x32d7037,0x312f65e,
        0x06b9dbf,0x03a9589,0x0008863 } },
    /* 84 */
    { { 0x0a9e8c9,0x1a19b6e,0x091ecd9,0x2e16ee0,0x2a11963,0x116cf34,
        0x390d530,0x194131f,0x2b580f3,0x31d569c,0x21d3751,0x3e2ce64,
        0x193de46,0x32454f0,0x004bffd },
      { 0x09554e7,0x170126e,0x2be6cd1,0x153de89,0x0353c67,0x350765c,
        0x202370b,0x1db01e5,0x30b12b1,0x3778591,0x00c8809,0x2e845d5,
        0x1fb1e56,0x170f90d,0x00e2db3 } },
    /* 85 */
    { { 0x328e33f,0x392aad8,0x36d1d71,0x0aebe04,0x1548678,0x1b55c8c,
        0x24995f8,0x2a5a01e,0x1bd1651,0x37c7c29,0x36803b6,0x3716c91,
        0x1a935a5,0x32f10b7,0x005c587 },
      { 0x2e8b4c0,0x336ccae,0x11382b6,0x22ec4cc,0x066d159,0x35fa585,
        0x23b2d25,0x3017528,0x2a674a8,0x3a4f900,0x1a7ce82,0x2b2539b,
        0x3d46545,0x0a07918,0x00eb9f8 } },
    /* 86 */
    { { 0x2cf5b9b,0x03e747f,0x166a34e,0x0afc81a,0x0a115b1,0x3aa814d,
        0x11cf3b1,0x163e556,0x3cbfb15,0x157c0a4,0x1bc703a,0x2141e90,
        0x01f811c,0x207218b,0x0092e6b },
      { 0x1af24e3,0x3af19b3,0x3c70cc9,0x335cbf3,0x068917e,0x055ee92,
        0x09a9308,0x2cac9b7,0x008b06a,0x1175097,0x36e929c,0x0be339c,
        0x0932436,0x15f18ba,0x0009f6f } },
    /* 87 */
    { { 0x29375fb,0x35ade34,0x11571c7,0x07b8d74,0x3fabd85,0x090fa91,
        0x362dcd4,0x02c3fdb,0x0608fe3,0x2477649,0x3fc6e70,0x059b7eb,
        0x1e6a708,0x1a4c220,0x00c6c4c },
      { 0x2a53fb0,0x1a3e1f5,0x11f9203,0x27e7ad3,0x038718e,0x3f5f9e4,
        0x308acda,0x0a8700f,0x34472fe,0x3420d7a,0x08076e5,0x014240e,
        0x0e7317e,0x197a98e,0x00538f7 } },
    /* 88 */
    { { 0x2663b4b,0x0927670,0x38dd0e0,0x16d1f34,0x3e700ab,0x3119567,
        0x12559d2,0x399b6c6,0x0a84bcd,0x163e7dd,0x3e2aced,0x058548c,
        0x03a5bad,0x011cf74,0x00c155c },
      { 0x3e454eb,0x2a1e64e,0x1ccd346,0x36e0edf,0x266ee94,0x2e74aaf,
        0x2d8378a,0x3cd547d,0x1d27733,0x0928e5b,0x353553c,0x26f502b,
        0x1d94341,0x2635cc7,0x00d0ead } },
    /* 89 */
    { { 0x0142408,0x382c3bb,0x3310908,0x2e50452,0x398943c,0x1d0ac75,
        0x1bf7d81,0x04bd00f,0x36b6934,0x3349c37,0x0f69e20,0x0195252,
        0x243a1c5,0x030da5f,0x00a76a9 },
      { 0x224825a,0x28ce111,0x34c2e0f,0x02e2b30,0x382e48c,0x26853ca,
        0x24bd14e,0x0200dec,0x1e24db3,0x0d3d775,0x132da0a,0x1dea79e,
        0x253dc0c,0x03c9d31,0x0020db9 } },
    /* 90 */
    { { 0x26c5fd9,0x05e6dc3,0x2eea261,0x08db260,0x2f8bec1,0x1255edf,
        0x283338d,0x3d9a91d,0x2640a72,0x03311f9,0x1bad935,0x152fda8,
        0x0e95abd,0x31abd15,0x00dfbf4 },
      { 0x107f4fa,0x29ebe9a,0x27353f7,0x3821972,0x27311fa,0x2925ab6,
        0x337ab82,0x2de6c91,0x1f115fe,0x044f909,0x21b93c2,0x3a5f142,
        0x13eb5e9,0x3ab1377,0x00b26b6 } },
    /* 91 */
    { { 0x22e5f2b,0x2ae7d4a,0x1ac481c,0x0a6fce1,0x2f93caf,0x242658e,
        0x3f35c3c,0x050f3d2,0x30074c9,0x142079c,0x0281b4c,0x295fea3,
        0x007413e,0x01726cd,0x00e4979 },
      { 0x1ab3cfb,0x1b76295,0x36adf55,0x1ad4636,0x1d444b9,0x3bd2e55,
        0x35425a5,0x1aa8cd3,0x3acecd2,0x1f769e8,0x1a655e9,0x1f6846f,
        0x24c70b5,0x3bff080,0x0002da3 } },
    /* 92 */
    { { 0x081d0d9,0x2c00d99,0x1fe2e24,0x396063f,0x03740db,0x243f680,
        0x3c1f451,0x1ff7b07,0x2803cf2,0x38ca724,0x2934f43,0x0d72d4d,
        0x0e8fe74,0x2975e21,0x002b505 },
      { 0x11adcc9,0x331a99c,0x21e16cf,0x1714c78,0x1f03432,0x2caa2a6,
        0x34a9679,0x2f7fe8b,0x0423c21,0x1a757ce,0x31b57d6,0x171e044,
        0x093b9b2,0x13602e0,0x00db534 } },
    /* 93 */
    { { 0x250a2f5,0x0b999eb,0x21d10d7,0x22b92a1,0x39b7f8d,0x0c37c72,
        0x29f70f3,0x3bf0e84,0x1d7e04f,0x07a42a9,0x272c3ae,0x1587b2f,
        0x155faff,0x10a336e,0x000d8fb },
      { 0x3663784,0x0d7dcf5,0x056ad22,0x319f8b1,0x0c05bae,0x2b6ff33,
        0x0292e42,0x0435797,0x188efb1,0x0d3f45e,0x119d49f,0x395dcd3,
        0x279fe27,0x133a13d,0x00188ac } },
    /* 94 */
    { { 0x396c53e,0x0d133e9,0x009b7ee,0x13421a0,0x1bbf607,0x1d284a5,
        0x1594f74,0x18cb47c,0x2dcac11,0x2999ddb,0x04e2fa5,0x1889e2c,
        0x0a89a18,0x33cb215,0x0052665 },
      { 0x104ab58,0x1d91920,0x3d6d7e3,0x04dc813,0x1167759,0x13a8466,
        0x0a06a54,0x103761b,0x25b1c92,0x26a8fdd,0x2474614,0x21406a4,
        0x251d75f,0x38c3734,0x007b982 } },
    /* 95 */
    { { 0x15f3060,0x3a7bf30,0x3be6e44,0x0baa1fa,0x05ad62f,0x1e54035,
        0x099d41c,0x2a744d9,0x1c0336f,0x3e99b5b,0x1afd3b1,0x2bf1255,
        0x1822bf8,0x2c93972,0x001d8cc },
      { 0x1d7584b,0x0508ade,0x20dd403,0x203a8fc,0x1c54a05,0x1611a31,
        0x037c8f9,0x1dcd4fe,0x110fbea,0x30f60bc,0x3dffe2f,0x26a1de1,
        0x0480367,0x18ec81c,0x0048eba } },
    /* 96 */
    { { 0x346e2f6,0x0435077,0x036789b,0x3e06545,0x313ab57,0x351a721,
        0x3372b91,0x15e6019,0x2fa4f6c,0x3c30656,0x272c9ac,0x10e84a8,
        0x2bdacea,0x232d9e2,0x009dadd },
      { 0x182579a,0x15b1af8,0x02d8cce,0x36cb49b,0x086feba,0x2911d17,
        0x268ee12,0x011e871,0x18698dc,0x35602b3,0x11b9ec2,0x0ade731,
        0x0f6a05a,0x1821015,0x00007da } },
    /* 97 */
    { { 0x3b00dd0,0x328d485,0x27a69e3,0x32c3a06,0x1046779,0x120b61c,
        0x19fef3d,0x0fef2e6,0x134d923,0x039bce0,0x348cd0e,0x0b0c007,
        0x066ae11,0x15d8f1b,0x00934e7 },
      { 0x33234dc,0x353f0f5,0x2fc1b44,0x18a193a,0x2fcae20,0x1afbc86,
        0x3afe252,0x17f7e10,0x107f3b7,0x2d84d54,0x394c2e6,0x19e96a9,
        0x0a37283,0x26c6152,0x003d262 } },
    /* 98 */
    { { 0x37cfaf8,0x01863d0,0x0299623,0x32c80cb,0x25b8742,0x0a4d90e,
        0x1f72472,0x13de652,0x31a0946,0x0ee0103,0x0f25414,0x2518b49,
        0x07e7604,0x1488d9b,0x00abd6b },
      { 0x1338f55,0x2ce4af5,0x1a0c119,0x3380525,0x21a80a9,0x235d4df,
        0x118ca7f,0x2dd8bcc,0x1c26bf4,0x32dc56b,0x28482b6,0x1418596,
        0x3c84d24,0x1f1a5a9,0x00d958d } },
    /* 99 */
    { { 0x1c21f31,0x22aa1ef,0x258c9ad,0x2d2018f,0x0adb3ca,0x01f75ee,
        0x186283b,0x31ad3bf,0x3621be7,0x3b1ee6d,0x015582d,0x3d61d04,
        0x2ddf32e,0x14b8a66,0x00c970c },
      { 0x2f24d66,0x00b8a88,0x100a78f,0x041d330,0x2efec1d,0x24c5b86,
        0x2a6a390,0x37526bc,0x2055849,0x3339f08,0x16bffc4,0x07f9d72,
        0x06ec09c,0x3f49ee8,0x00cad98 } },
    /* 100 */
    { { 0x248b73e,0x1b8b42d,0x285eed7,0x39473f4,0x1a9f92c,0x3b44f78,
        0x086c062,0x06a4ea3,0x34ea519,0x3c74e95,0x1ad1b8b,0x1737e2c,
        0x2cfe338,0x0a291f4,0x00bbecc },
      { 0x1cec548,0x0c9b01a,0x20b298d,0x377c902,0x24f5bc1,0x2415c8d,
        0x1a70622,0x2529090,0x1c5c682,0x283f1ba,0x2319f17,0x0120e2e,
        0x01c6f4d,0x33c67ff,0x008b612 } },
    /* 101 */
    { { 0x03830eb,0x02d4053,0x10c59bb,0x0f23b83,0x13d08f8,0x26ea4e2,
        0x2626427,0x0a45292,0x0449cbc,0x0175750,0x074c46f,0x27ae0f8,
        0x2d7d6ae,0x163dd3a,0x0063bb7 },
      { 0x2bb29e0,0x034bab1,0x341e1c4,0x21d2c0b,0x295aa2d,0x0f2c666,
        0x1891755,0x13db64a,0x2fe5158,0x337646e,0x31a1aae,0x057bee4,
        0x00f9e37,0x396d19e,0x00c1b6a } },
    /* 102 */
    { { 0x2772f41,0x34f92d0,0x39d1cde,0x174ef2d,0x03a700d,0x03fbb98,
        0x30d50e8,0x352ed10,0x1fcf5e5,0x3d113bc,0x26e358f,0x180653f,
        0x1b43cc6,0x3cc9aa4,0x00e68a2 },
      { 0x37fe4d2,0x09dd725,0x01eb584,0x171f8a9,0x278fdef,0x3e37c03,
        0x3bec02f,0x149757c,0x0cd5852,0x37d2e10,0x0e6988b,0x1c120e9,
        0x0b83708,0x38e7319,0x0039499 } },
    /* 103 */
    { { 0x08df5fe,0x177a02c,0x0362fc0,0x1f18ee8,0x00c1295,0x173c50a,
        0x379414d,0x1885ba8,0x32a54ef,0x2315644,0x39e65cf,0x357c4be,
        0x1d66333,0x09e05a5,0x0009c60 },
      { 0x1f7a2fb,0x073b518,0x2eb83ac,0x11353d7,0x1dd8384,0x0c63f2b,
        0x238c6c8,0x2a1920a,0x2e5e9f1,0x1cc56f8,0x042daf4,0x1ed5dc5,
        0x25f9e31,0x012a56a,0x0081b59 } },
    /* 104 */
    { { 0x321d232,0x2c71422,0x3a756b6,0x30230b2,0x387f3db,0x3a7c3eb,
        0x274b46a,0x201e69f,0x185bb7b,0x140da82,0x0d974a2,0x0616e42,
        0x35ec94f,0x3bc366b,0x005aa7c },
      { 0x3dcfffc,0x19a9c15,0x3225e05,0x36ae114,0x16ea311,0x0cda2aa,
        0x2a1a8d2,0x154b5cb,0x08348cd,0x17b66c8,0x080ea43,0x21e59f3,
        0x04173b9,0x31d5b04,0x00ad735 } },
    /* 105 */
    { { 0x2e76ef4,0x216acf3,0x2b93aea,0x112bc74,0x3449974,0x2b2e48f,
        0x11929be,0x2f03021,0x19051e3,0x0ac202d,0x19be68a,0x3b87619,
        0x26cdac4,0x086592c,0x00f00de },
      { 0x2e90d4d,0x3ed703c,0x2c648d7,0x29ddf67,0x000e219,0x3471247,
        0x26febd5,0x1161713,0x3541a8f,0x302038d,0x08d2af9,0x26e1b21,
        0x398514a,0x36dad99,0x002ed70 } },
    /* 106 */
    { { 0x06f25cb,0x1104596,0x370faee,0x07e83f3,0x0f7b686,0x228d43a,
        0x12cd201,0x0a1bd57,0x3e592dc,0x1e186fc,0x2226aba,0x2c63fe9,
        0x17b039a,0x1efaa61,0x00d1582 },
      { 0x2e6acef,0x07d51e4,0x3ac326c,0x322b07e,0x1422c63,0x32ff5c7,
        0x18760df,0x048928b,0x139b251,0x04d7da9,0x048d1a2,0x2a23e84,
        0x199dbba,0x2fa7afe,0x0049f1a } },
    /* 107 */
    { { 0x3492b73,0x27d3d3d,0x2b1a16f,0x07b2ce4,0x0cf28ec,0x2729bff,
        0x3130d46,0x3e96116,0x140b72e,0x14a2ea3,0x1ca066f,0x3a61f1d,
        0x022ebac,0x09192b4,0x003e399 },
      { 0x12555bb,0x0b6139d,0x239463a,0x12a70ab,0x2aaa93b,0x2254e72,
        0x00424ec,0x26a6736,0x26daa11,0x25b5ad6,0x379f262,0x140cd30,
        0x0c7d3bd,0x097bbcf,0x00899e9 } },
    /* 108 */
    { { 0x3825dc4,0x3cd946f,0x0462b7f,0x31102e7,0x30f741c,0x3313ed6,
        0x1ff5a95,0x15bf9dc,0x09b47fd,0x0f2e7a7,0x1626c0d,0x3c14f6d,
        0x14098bd,0x19d7df8,0x00a97ce },
      { 0x0934f5e,0x3f968db,0x046f68a,0x12333bf,0x26cd5e1,0x1ea2161,
        0x358570d,0x235031d,0x35edd55,0x05265e3,0x24ae00c,0x3542229,
        0x25bb2a1,0x1c83c75,0x0058f2a } },
    /* 109 */
    { { 0x24daedb,0x376928f,0x305266f,0x0499746,0x038318c,0x312efd7,
        0x1910a24,0x33450a3,0x1c478a9,0x39d8bf9,0x12cc0ae,0x397aeab,
        0x0654c08,0x095f283,0x00d2cdf },
      { 0x0b717d2,0x1f162c2,0x107a48f,0x128e1b3,0x2380718,0x39f4044,
        0x00f626a,0x05ec0c9,0x21bc439,0x200fa4d,0x20aea01,0x186a1d8,
        0x26372f2,0x1a91f87,0x0053f55 } },
    /* 110 */
    { { 0x3512a90,0x33b958b,0x29f1c84,0x0106c3a,0x224b3c0,0x09b307a,
        0x215d2de,0x3bdf43b,0x22cf0c9,0x176121d,0x1534143,0x09ba717,
        0x16b3110,0x0f73f6c,0x008f5b7 },
      { 0x2c75d95,0x26fbcb4,0x0dda1f6,0x206f819,0x28d33d5,0x1fb4d79,
        0x024c125,0x30a0630,0x1f9c309,0x0fe350d,0x1696019,0x0a54187,
        0x09541fd,0x35e3a79,0x0066618 } },
    /* 111 */
    { { 0x0e382de,0x33f5163,0x0dde571,0x3bb7a40,0x1175806,0x12ae8ed,
        0x0499653,0x3b25586,0x38ade7a,0x3fa265d,0x3f4aa97,0x3c03dbb,
        0x30c6de8,0x32d4042,0x00ae971 },
      { 0x2f788f1,0x1fbaf0e,0x3e2d182,0x3ff904f,0x0d46229,0x1d0726d,
        0x15455b4,0x093ae28,0x290f8e4,0x097c0b9,0x1ae8771,0x28480bb,
        0x04f6d40,0x3689925,0x0049b3b } },
    /* 112 */
    { { 0x35b2d69,0x31819c0,0x11b0d63,0x035afb6,0x2b50715,0x2bece6c,
        0x35f82f7,0x0ad987c,0x0011601,0x02e6f67,0x2d0a5f5,0x365e583,
        0x2f7c900,0x11449c5,0x00ed705 },
      { 0x27abdb4,0x1bbfd04,0x301c157,0x263c079,0x36850d6,0x3f21f8b,
        0x27d7493,0x0f9227e,0x06fb0ce,0x002daf3,0x37d8c1c,0x3ef87d7,
        0x19cc6f4,0x0c3809c,0x00cf752 } },
    /* 113 */
    { { 0x22d94ed,0x075b09c,0x020e676,0x084dc62,0x2d1ec3f,0x17439f1,
        0x240b702,0x33cc596,0x30ebaf3,0x0359fe0,0x393ea43,0x0ece01e,
        0x16c6963,0x03a82f2,0x0017faa },
      { 0x3866b98,0x3cd20b7,0x12d4e6b,0x3a6a76d,0x1205c1e,0x3e6ae1a,
        0x2f9bbdf,0x2e61547,0x2d175ee,0x28e18f6,0x13cf442,0x085b0ef,
        0x0e321ef,0x238fe72,0x003fb22 } },
    /* 114 */
    { { 0x360ac07,0x26dc301,0x3f4d94f,0x2ba75e6,0x1f3c9cc,0x17ff20f,
        0x0ea084c,0x30e39cf,0x143dc49,0x03bd43e,0x3c9e733,0x19e8aba,
        0x27fbaf4,0x12d913a,0x005ee53 },
      { 0x3609e7f,0x2d89c80,0x09f020c,0x1558bf7,0x3098443,0x3c515fd,
        0x1c8e580,0x16506bd,0x26cb4b2,0x1747d42,0x2ec8239,0x32c91f0,
        0x1ca3377,0x079768f,0x00a5f3e } },
    /* 115 */
    { { 0x185fa94,0x122759f,0x0e47023,0x0dcb6e7,0x10ba405,0x3b5eab4,
        0x1f7a1fa,0x32d003f,0x1739a4c,0x3295ec3,0x1b18967,0x3f3b265,
        0x34d2448,0x2dbadc9,0x00f30b5 },
      { 0x01c5338,0x2d1dcf2,0x2bd07cc,0x39a8fb5,0x2b85639,0x355bab6,
        0x1df95f1,0x01eb5f6,0x17f0a16,0x1b895b5,0x157574d,0x29fff72,
        0x3a8c46d,0x0118071,0x0065f84 } },
    /* 116 */
    { { 0x3a1e7f1,0x17432f2,0x1f648d4,0x3000ad5,0x2ef0a08,0x1f86624,
        0x1ca31b1,0x241f9dc,0x2cb4885,0x2b8610f,0x364ce16,0x1e5faf0,
        0x0b33867,0x2cb637d,0x00816d2 },
      { 0x1aa8671,0x02c394e,0x35f5e87,0x393040a,0x39f0db3,0x1c831a5,
        0x2966591,0x034a8d0,0x09e613c,0x042b532,0x018ddd6,0x3e402c9,
        0x2e20e1a,0x29cb4cd,0x00e087c } },
    /* 117 */
    { { 0x3a10079,0x20c7fea,0x3ff2222,0x1edb593,0x00dc5f8,0x3a32ccc,
        0x1479073,0x0cfed11,0x2a2702a,0x17a056a,0x1fba321,0x235acb9,
        0x149c833,0x172de7d,0x000f753 },
      { 0x2e95923,0x3b365cb,0x009f471,0x0df1b47,0x21e868b,0x199bbd3,
        0x07b8ecc,0x12ff0af,0x189808a,0x3bd5059,0x3fbc4d2,0x0fa7b88,
        0x1125bf2,0x0db0b5d,0x0043572 } },
    /* 118 */
    { { 0x29cdb1b,0x1db656e,0x391efe1,0x004be09,0x245a1ca,0x3793328,
        0x254af24,0x2f2e65d,0x10e5cc4,0x2af6fe7,0x2d97ac0,0x29f7d42,
        0x19fd6f6,0x0ac184d,0x00c5211 },
      { 0x305eae3,0x36738d3,0x2c2b696,0x00ba50e,0x3903adc,0x2122f85,
        0x0753470,0x1cf96a4,0x1702a39,0x247883c,0x2feb67e,0x2ab3071,
        0x3c6b9e1,0x30cb85a,0x002ca0a } },
    /* 119 */
    { { 0x3871eb5,0x284b93b,0x0a7affe,0x176a2fc,0x294c2f2,0x204d3aa,
        0x1e4c2a7,0x3ec4134,0x2fb0360,0x3847b45,0x05fc11b,0x0a6db6e,
        0x390fa40,0x2adfd34,0x005e9f7 },
      { 0x0646612,0x1b5cbcc,0x10d8507,0x0777687,0x3a0afed,0x1687440,
        0x0222578,0x1af34a4,0x2174e27,0x372d267,0x11246c3,0x34769c5,
        0x2044316,0x1b4d626,0x00c72d5 } },
    /* 120 */
    { { 0x2e5bb45,0x3ff1d36,0x16dcdf5,0x128986f,0x399068c,0x2a63b1e,
        0x0afa7aa,0x3a5b770,0x200f121,0x33b74bb,0x1414045,0x0f31ef8,
        0x2f50e16,0x2f38cd6,0x00b0b1b },
      { 0x1a06293,0x035e140,0x2644d44,0x1f1954b,0x2cdebab,0x31d5f91,
        0x0b8dbc8,0x38f2d23,0x3783cab,0x2a07e73,0x3123f59,0x3409846,
        0x3784ddd,0x223bbac,0x003dc7b } },
    /* 121 */
    { { 0x0741456,0x234e631,0x2121e1b,0x00980ca,0x3a9dfa9,0x098c916,
        0x3fc86d1,0x1c63072,0x3625244,0x13d0471,0x05b0fc5,0x1487550,
        0x2498596,0x11bb6ea,0x001afab },
      { 0x274b4ad,0x240aea1,0x3d12a75,0x2b56b61,0x1486b43,0x1b83426,
        0x31c7363,0x35b59ca,0x207bb6c,0x38e6243,0x19bace4,0x0a26671,
        0x35e3381,0x0c2ded4,0x00d8da4 } },
    /* 122 */
    { { 0x2b75791,0x19590b1,0x2bfb39f,0x2988601,0x0050947,0x0d8bbe1,
        0x23e3701,0x08e4432,0x2ed8c3d,0x326f182,0x332e1dd,0x12219c5,
        0x2e0779b,0x367aa63,0x0012d10 },
      { 0x251b7dc,0x0a08b4d,0x1138b6f,0x2ea02af,0x06345a5,0x1cb4f21,
        0x0332624,0x1d49d88,0x140acc5,0x2f55287,0x024447c,0x291ace9,
        0x1a4966e,0x015cbec,0x005bc41 } },
    /* 123 */
    { { 0x351cd0e,0x315e8e9,0x07d6e70,0x067ae8f,0x2190d84,0x351f556,
        0x03bee79,0x31b62c7,0x266f912,0x1b6a504,0x007a6ad,0x3a6ab31,
        0x3891112,0x3c45ba0,0x00d6ce5 },
      { 0x0e1f2ce,0x32a5edc,0x1434063,0x1ca084f,0x2a3e47c,0x137e042,
        0x16e2418,0x2069280,0x3b0dfd8,0x35a22b5,0x289bf0a,0x1f667f2,
        0x02d23a3,0x0ce688f,0x00d8e3f } },
    /* 124 */
    { { 0x10bed6f,0x14c58dd,0x0b0abdf,0x0ca0f9a,0x3808abc,0x2ec228c,
        0x2366275,0x12afa16,0x20f6b0e,0x37dca8e,0x3af0c6a,0x1c5b467,
        0x1b25ff7,0x00814de,0x0022dcc },
      { 0x1a56e11,0x02fe37e,0x3f21740,0x35d5a91,0x06cb8ba,0x29bad91,
        0x17176f7,0x2d919f2,0x0f7d1f5,0x13a3f61,0x04ddb05,0x0c82a51,
        0x286f598,0x2e8c777,0x0007071 } },
    /* 125 */
    { { 0x0f8fcb9,0x3e83966,0x170c6fd,0x3825343,0x089cec8,0x01b482a,
        0x0993971,0x3327282,0x39aba8a,0x32456fe,0x1507e01,0x1c3252d,
        0x21ffb13,0x29822a0,0x0083246 },
      { 0x23c378f,0x1cea7ef,0x1be9a82,0x224d689,0x37e5447,0x3764a75,
        0x3a49724,0x361e1b3,0x19d365b,0x3a61ffb,0x1c29a7a,0x20ab251,
        0x17ec549,0x175d777,0x004589a } },
    /* 126 */
    { { 0x15540a9,0x2ec5d2a,0x05b09fa,0x1bc058b,0x07cfb88,0x28f7b86,
        0x3e766be,0x189305e,0x01fe88e,0x23fdf69,0x0b919c3,0x02dc7ae,
        0x3f9a9ad,0x0b83cc7,0x0086a52 },
      { 0x28bc259,0x39bdca1,0x39e4bc8,0x0e0f33b,0x16130c6,0x2919955,
        0x31f4549,0x2fed027,0x30919b2,0x0a39b03,0x0ca7bb2,0x1711b24,
        0x3b67b94,0x05a136b,0x00acd87 } },
    /* 127 */
    { { 0x0c53841,0x31cb284,0x3ced090,0x06d5693,0x1c20ae0,0x0408d2b,
        0x37ebd5e,0x081900f,0x26a8589,0x0acfd0a,0x34a1472,0x2f0c302,
        0x124ccbd,0x10de328,0x00971bc },
      { 0x17ff2ff,0x27d1b54,0x147b6f7,0x38bb2ea,0x26a9c96,0x0a49448,
        0x39f2f46,0x247c579,0x3b16a4e,0x28c2a5a,0x2d4c72d,0x11f248c,
        0x1e4df11,0x047d604,0x0065bc3 } },
    /* 128 */
    { { 0x39b3239,0x1f75f44,0x3bae87c,0x139360c,0x18b5782,0x3ffc005,
        0x3c48789,0x2bc6af2,0x38b909e,0x223ff3b,0x31443a7,0x017d3bb,
        0x0bfed99,0x128b857,0x00020dd },
      { 0x306d695,0x25a7b28,0x2f60ca2,0x2b6e4f2,0x1df940c,0x1fa9b8e,
        0x37fab78,0x13f959f,0x10ff98c,0x38343b8,0x019cb91,0x11a1e6b,
        0x17ab4c6,0x1431f47,0x004b4ea } },
    /* 129 */
    { { 0x20db57e,0x102515e,0x170219e,0x2b66a32,0x1e6017c,0x2f973fe,
        0x3739e51,0x0e28b6f,0x3cda7a9,0x30d91ac,0x28350df,0x1444215,
        0x098b504,0x1bcd5b8,0x00ad3bd },
      { 0x22e3e3e,0x3aeaffb,0x26cb935,0x0091ce4,0x2fbd017,0x3a7ed6a,
        0x335b029,0x3bfc1f1,0x3852e3f,0x2b14a86,0x046b405,0x266af4c,
        0x3997191,0x33b0e40,0x00e306f } },
    /* 130 */
    { { 0x3e4712c,0x26bb208,0x18eed6d,0x1b30f06,0x27ca837,0x06faf62,
        0x1831873,0x3fbcf9b,0x3f3d88b,0x1fb55eb,0x0f44edc,0x29917bb,
        0x3151772,0x342d72e,0x00d4e63 },
      { 0x2ee0ecf,0x39e8733,0x2e8e98c,0x0cd4e0f,0x08f0126,0x1ad157a,
        0x079078a,0x23018ee,0x196c765,0x2b2f34f,0x0783336,0x075bf9c,
        0x3713672,0x098d699,0x00f21a7 } },
    /* 131 */
    { { 0x186ba11,0x22cf365,0x048019d,0x2ca2970,0x0d9e0ae,0x08c3bd7,
        0x261dbf2,0x2fc2790,0x1ee02e6,0x10256a7,0x00dc778,0x18dc8f2,
        0x157b189,0x2ebc514,0x005c97d },
      { 0x3c4503e,0x1d10d12,0x337097e,0x0c6169a,0x30fb1cb,0x3481752,
        0x0df2bec,0x19768fa,0x1bcf8f7,0x2925f74,0x2c988a1,0x3be571d,
        0x04cfa92,0x2ea9937,0x003f924 } },
    /* 132 */
    { { 0x268b448,0x06e375c,0x1b946bf,0x287bf5e,0x3d4c28b,0x138d547,
        0x21f8c8e,0x21ea4be,0x2d45c91,0x35da78e,0x00326c0,0x210ed35,
        0x1d66928,0x0251435,0x00fefc8 },
      { 0x0339366,0x216ff64,0x2c3a30c,0x3c5733d,0x04eeb56,0x2333477,
        0x32b1492,0x25e3839,0x1b5f2ce,0x0dcfba1,0x3165bb2,0x3acafcc,
        0x10abfcd,0x248d390,0x008106c } },
    /* 133 */
    { { 0x102f4ee,0x3c0585f,0x1225c8d,0x11c6388,0x08a7815,0x2b3e790,
        0x2895eb6,0x18cf53a,0x0b56e5a,0x2e2c003,0x3e981ff,0x0761b55,
        0x1bc32f3,0x0a7111d,0x00f5c80 },
      { 0x3568973,0x1587386,0x16ec764,0x20698a6,0x02f809b,0x2821502,
        0x113d64d,0x38c2679,0x15de61c,0x0309f60,0x272999e,0x29bfe64,
        0x173f70d,0x1de7fab,0x00bd284 } },
    /* 134 */
    { { 0x31cdf2b,0x0f0be66,0x2151603,0x01af17e,0x32a99cf,0x085dece,
        0x27d2591,0x1520df4,0x273c448,0x1ec7c54,0x102e229,0x355f604,
        0x2acb75f,0x005f1fd,0x003d43e },
      { 0x270eb28,0x22ec2ce,0x306b41a,0x238fa02,0x167de2d,0x030a379,
        0x245a417,0x1808c24,0x0b1a7b2,0x3ab5f6f,0x2cbc6c1,0x2c228d4,
        0x3041f70,0x2d9a6cc,0x00b504f } },
    /* 135 */
    { { 0x17a27c2,0x216ad7e,0x011ba8e,0x22f0428,0x16ac5ec,0x3ef3c58,
        0x345533f,0x0298155,0x2856579,0x0005e03,0x19ee75b,0x146fe16,
        0x29881e4,0x18ece70,0x008907a },
      { 0x20189ed,0x119ce09,0x35cb76d,0x0d91ef4,0x2284a44,0x032ad87,
        0x0e8c402,0x3c82b5d,0x38c416c,0x398992f,0x1fd820c,0x169b255,
        0x3b5fcfa,0x1343c92,0x00fa715 } },
    /* 136 */
    { { 0x33f5034,0x20b3b26,0x28fd184,0x16b3679,0x3962d44,0x15d1bc8,
        0x2fb1d69,0x1292c99,0x25a58c9,0x1b19ab7,0x2d68a5b,0x2f6a09b,
        0x0d6aedb,0x2935eac,0x0005664 },
      { 0x25e32fc,0x13f9440,0x3252bcd,0x2fea5b7,0x161a5ae,0x0564a8c,
        0x0a07e23,0x1545f62,0x0de9890,0x1d76765,0x1fd440e,0x2ed0041,
        0x3db4c96,0x1e8ba01,0x001b0c4 } },
    /* 137 */
    { { 0x0223878,0x29ab202,0x15585c2,0x1a79969,0x1ba08c2,0x2ef09ff,
        0x2b1b9b9,0x181f748,0x1bf72b9,0x224645c,0x2588dc5,0x2d157e7,
        0x22d939a,0x05b88d9,0x006d549 },
      { 0x31de0c1,0x23a4e0e,0x278f8da,0x1aa013c,0x1a84d18,0x0d185a5,
        0x0988ccd,0x2c32efd,0x3bee10e,0x37d7ab8,0x3f2a66e,0x3e2da3e,
        0x1b5701f,0x3d9f0c1,0x00a68da } },
    /* 138 */
    { { 0x0b2e045,0x0133fd1,0x05d4c10,0x0d92c70,0x391b5e1,0x2292281,
        0x2e40908,0x2ec694e,0x195ea11,0x29cfeca,0x3d93a4e,0x01215c0,
        0x08a5f32,0x37a0eff,0x00cce45 },
      { 0x2b3106e,0x12a5fb0,0x0b4faff,0x0c2da12,0x09069c6,0x35d8907,
        0x2837a6e,0x3db3fb6,0x3136cc3,0x222836b,0x3da018a,0x2741274,
        0x13ba319,0x1ac7642,0x00f867c } },
    /* 139 */
    { { 0x2527296,0x10a9595,0x178de4d,0x0f739c4,0x0ae26c7,0x3094599,
        0x20adac6,0x2b875c2,0x3ae5dc0,0x3e04d20,0x1aab2da,0x1d3ab37,
        0x15f4f75,0x0b730b5,0x00c56b5 },
      { 0x1f32923,0x2f059e5,0x2a89872,0x2056f74,0x04be175,0x1da67c0,
        0x17f1e7a,0x3780a6d,0x0723ac2,0x257f367,0x1237773,0x2bcee86,
        0x0b97f83,0x38aff14,0x00a64d4 } },
    /* 140 */
    { { 0x2552b40,0x0b6b883,0x12e8217,0x0974d35,0x062f497,0x1e563e6,
        0x30ee400,0x375d1e4,0x290751f,0x0d5b68a,0x353e48c,0x064a0d3,
        0x3c343f1,0x309a394,0x0034d2a },
      { 0x3111286,0x0f08604,0x1827107,0x0536a76,0x0201dac,0x3a574de,
        0x2c29dbe,0x382c7b0,0x1191f3e,0x324c5bc,0x144ce71,0x24327c1,
        0x1212778,0x22bc9d8,0x00d7713 } },
    /* 141 */
    { { 0x34ad1cd,0x1179b4e,0x1bc1780,0x1392a92,0x2cd86b9,0x359de85,
        0x251f1df,0x0da5d5f,0x135fa61,0x0f64a42,0x34f4d89,0x0fe564c,
        0x3cf9b7a,0x122d757,0x008c9c2 },
      { 0x370d4e9,0x0e9209b,0x0ae99f2,0x1518c64,0x0172734,0x2c20692,
        0x1d7c135,0x149c52f,0x38928d6,0x3c78b78,0x25841d1,0x2eaa897,
        0x372e50b,0x29e5d19,0x00c4c18 } },
    /* 142 */
    { { 0x13375ac,0x389a056,0x211310e,0x2f9f757,0x04f3288,0x103cd4e,
        0x17b2fb2,0x2c78a6a,0x09f1de6,0x23e8442,0x1351bc5,0x1b69588,
        0x285b551,0x0464b7e,0x00573b6 },
      { 0x0ba7df5,0x259a0db,0x2b4089e,0x05630a2,0x3f299be,0x350ff2f,
        0x1c9348a,0x3becfa4,0x3cc9a1c,0x17a6ef1,0x338b277,0x2b761d9,
        0x2aa01c8,0x3cb9dd7,0x006e3b1 } },
    /* 143 */
    { { 0x277788b,0x16a222d,0x173c036,0x310ff58,0x2634ae8,0x392636f,
        0x0987619,0x1e6acc1,0x26dc8f7,0x242310f,0x0c09aca,0x22b8e11,
        0x0d17006,0x1c2c806,0x002380c },
      { 0x297c5ec,0x1fef0e8,0x3948cf7,0x14f2915,0x2dacbc8,0x0dafb1f,
        0x10de043,0x31184da,0x06414ee,0x3c9aeeb,0x1f713ab,0x308f1f8,
        0x1569ed1,0x3f379bf,0x00f08bb } },
    /* 144 */
    { { 0x0770ee3,0x058fd21,0x17065f8,0x251d128,0x10e0c7f,0x06cb51b,
        0x0f05f7e,0x3666a72,0x3e7d01f,0x2d05fab,0x11440e5,0x28577d4,
        0x2fbcf2b,0x14aa469,0x00dc5c5 },
      { 0x270f721,0x1c75d28,0x085b862,0x1d68011,0x132c0a0,0x37be81d,
        0x1a87e38,0x083fa74,0x3acbf0d,0x16d6429,0x0feda1f,0x031070a,
        0x2ec2443,0x21e563d,0x00454d2 } },
    /* 145 */
    { { 0x0525435,0x1e98d5f,0x3dbc52b,0x1fcdf12,0x13d9ef5,0x3ff311d,
        0x393e9ed,0x3cef8ae,0x2987710,0x3bdee2e,0x21b727d,0x3ba1b68,
        0x10d0142,0x3c64b92,0x0055ac3 },
      { 0x0c1c390,0x38e9bb0,0x1e7b487,0x11511b3,0x1036fb3,0x25aba54,
        0x1eb2764,0x048d022,0x0d971ed,0x1bb7fb5,0x100f0b4,0x06c3756,
        0x2f0d366,0x3c6e160,0x0011bd6 } },
    /* 146 */
    { { 0x36bc9d1,0x24d43c1,0x12c35cf,0x2fb3cf3,0x015d903,0x16bc0c7,
        0x0fc8c22,0x3195c87,0x2488b1c,0x1f82b4c,0x30014e8,0x27ee58d,
        0x31658dd,0x1684a5f,0x00f0f3a },
      { 0x1f703aa,0x023eebc,0x20babb9,0x080bd9d,0x12f9cc4,0x1a8e2d4,
        0x0eec666,0x1176803,0x33005d6,0x1137b68,0x37de339,0x33d71cb,
        0x0c906b9,0x14086b5,0x00aeef6 } },
    /* 147 */
    { { 0x219045d,0x0f22c5e,0x024c058,0x00b414a,0x0ae7c31,0x3db3e96,
        0x234979f,0x0cf00a8,0x3c962c7,0x27fa77f,0x1c0c4b0,0x1fe8942,
        0x218053a,0x1eed3f8,0x0051643 },
      { 0x2a23ddb,0x138f570,0x104e945,0x21ca270,0x30726d8,0x3f45490,
        0x37d9184,0x242ea25,0x33f6d77,0x3f15679,0x065af85,0x34fa1f5,
        0x2e46b8f,0x31d17fb,0x00a2615 } },
    /* 148 */
    { { 0x335167d,0x181ea10,0x0887c8d,0x01383d7,0x18b42d8,0x263447e,
        0x1f13df3,0x0319d7e,0x0872074,0x2d6aa94,0x23d9234,0x36a69aa,
        0x0bad183,0x3138a95,0x00bd3a5 },
      { 0x1b0f658,0x0e4530b,0x373add1,0x1b968fc,0x329dcb6,0x09169ca,
        0x162df55,0x0211eff,0x02391e4,0x3867460,0x3136b1a,0x37dd36e,
        0x3bc5bd9,0x2dacfe4,0x0072a06 } },
    /* 149 */
    { { 0x119d96f,0x067b0eb,0x00996da,0x293eca9,0x2b342da,0x1889c7a,
        0x21633a6,0x0152c39,0x281ce8c,0x18ef3b3,0x0bd62dc,0x3238186,
        0x38d8b7c,0x3867b95,0x00ae189 },
      { 0x0ed1eed,0x1e89777,0x13ab73e,0x029e1d7,0x2c1257f,0x33fbc09,
        0x32d5a21,0x3d870b2,0x39bb1fd,0x33663bc,0x24e83e6,0x239bda4,
        0x3088bcd,0x01db1ed,0x00d71e7 } },
    /* 150 */
    { { 0x14245bf,0x0da0c27,0x153b339,0x05cab0a,0x122d962,0x1b0f0f3,
        0x3f5a825,0x267a2ce,0x2910d06,0x254326f,0x0f36645,0x025118e,
        0x37c35ec,0x36e944e,0x006c056 },
      { 0x05ab0e3,0x29aa0c1,0x1295687,0x1fd1172,0x08d40b5,0x05bd655,
        0x345048a,0x02a1c3c,0x2393d8f,0x0992d71,0x1f71c5e,0x18d4e8a,
        0x30dd410,0x11d61d3,0x00dd58b } },
    /* 151 */
    { { 0x2230c72,0x30213d8,0x05e367e,0x329204e,0x0f14f6c,0x3369ddd,
        0x0bb4074,0x2edafd6,0x1b1aa2d,0x0785404,0x0c035ab,0x220da74,
        0x1f2fdd4,0x092a091,0x00ef83c },
      { 0x3dc2538,0x1cca3e7,0x246afb5,0x24c647f,0x0798082,0x0bb7952,
        0x0f5c443,0x008b38a,0x299ea1a,0x3c6cf36,0x3df2ec7,0x398e6dc,
        0x29a1839,0x1cadd83,0x0077b62 } },
    /* 152 */
    { { 0x25d56d5,0x3546f69,0x16e02b1,0x3e5fa9a,0x03a9b71,0x2413d31,
        0x250ecc9,0x1d2de54,0x2ebe757,0x2a2f135,0x2aeeb9a,0x0d0fe2b,
        0x204cb0e,0x07464c3,0x00c473c },
      { 0x24cd8ae,0x0c86c41,0x221c282,0x0795588,0x1f4b437,0x06fc488,
        0x0c81ecd,0x020bf07,0x3a9e2c8,0x2294a81,0x3a64a95,0x0363966,
        0x32c9a35,0x0f79bec,0x0029e4f } },
    /* 153 */
    { { 0x289aaa5,0x2755b2e,0x059e0aa,0x3031318,0x0f0208a,0x35b7729,
        0x00d9c6b,0x3dd29d0,0x075f2c2,0x0ece139,0x31562dd,0x04187f2,
        0x13b8d4c,0x0920b85,0x003924e },
      { 0x09808ab,0x2e36621,0x2a36f38,0x1829246,0x229bf32,0x20883b7,
        0x159ada8,0x3108a14,0x15bbe5b,0x1e2d1e4,0x1730096,0x0d35cbb,
        0x15d0da9,0x0e60b94,0x00c4f30 } },
    /* 154 */
    { { 0x31de38b,0x27b9086,0x2760e3e,0x169098d,0x2a124e2,0x00596c6,
        0x3f73c09,0x0d31642,0x2341464,0x248600a,0x2e1fa10,0x2aa0fc8,
        0x051e954,0x00f3b67,0x001d4bd },
      { 0x18751e6,0x25a8e1e,0x07f5c2d,0x17e30d4,0x0ed2723,0x23093e2,
        0x3b80e2c,0x13de2d7,0x2fad37f,0x1be1cfb,0x3224ba9,0x0a7f5d3,
        0x1714972,0x06667b7,0x009dcd9 } },
    /* 155 */
    { { 0x294f22a,0x3e06993,0x0341ee9,0x24bdc7b,0x2e56098,0x2660a13,
        0x018ddda,0x2c261b2,0x2953b54,0x267f51c,0x0e8a7cc,0x29ab00c,
        0x3a38247,0x397ac81,0x00de684 },
      { 0x36b956b,0x347b34a,0x35834bd,0x053c06c,0x0090844,0x148cec5,
        0x380b325,0x2f17b8b,0x054ef5e,0x09683fb,0x3f8b29a,0x33c979a,
        0x1e01474,0x3e81fca,0x001c757 } },
    /* 156 */
    { { 0x30fdfe4,0x2d712ba,0x13671bc,0x2cfc226,0x3d7c649,0x16f020e,
        0x368e3f0,0x2981ebb,0x246a78a,0x115e81b,0x21223a4,0x04dbb30,
        0x1a50ba2,0x12114bd,0x0089bd6 },
      { 0x055f15a,0x1046e51,0x00fd724,0x1c022a7,0x323dfa9,0x36d8efb,
        0x0da4d16,0x0910dec,0x2c1fb16,0x2dbe29f,0x298284f,0x2b273bb,
        0x26022c1,0x20accd5,0x00085a5 } },
    /* 157 */
    { { 0x01f138a,0x2d87e7b,0x0c2815c,0x0c19a3c,0x311c9a2,0x3e4fce3,
        0x029729d,0x21236b2,0x2984048,0x3f3bc95,0x2bba8fb,0x1a1b680,
        0x0619a3f,0x29e0447,0x00ed5fe },
      { 0x2d1c833,0x3dcef35,0x3f809b4,0x01a1b9e,0x1509516,0x10ac754,
        0x2735080,0x27b0a8a,0x2495fb8,0x0a7bdba,0x1ef8b89,0x00233a5,
        0x0568bf1,0x1a126ba,0x0078a7e } },
    /* 158 */
    { { 0x0470cd8,0x20e9f04,0x30003fe,0x20be1b7,0x1927346,0x2a5026d,
        0x1ac06bd,0x2717ed7,0x2609493,0x3079ea5,0x1cc116d,0x31b0541,
        0x2c8ccde,0x10219ae,0x001a52b },
      { 0x2864045,0x0e8d95b,0x2fc1530,0x0aa44e7,0x345eae7,0x3cc7553,
        0x3ec6466,0x229b60e,0x06f6e95,0x00bed2a,0x0ff4403,0x181c639,
        0x2e0df67,0x1f8fa46,0x0000811 } },
    /* 159 */
    { { 0x04310a2,0x20cee8e,0x09fc5d5,0x3707f5b,0x0bdfb4e,0x12713ee,
        0x24f1028,0x0787ee6,0x39a581c,0x3797ec8,0x10a9746,0x112cb9f,
        0x142b9ba,0x1da0ef6,0x0078f7b },
      { 0x07607ae,0x3232872,0x2a7e076,0x0bb572a,0x182b23c,0x1d8f918,
        0x181f392,0x37c45a9,0x24a3886,0x0b2a297,0x264e7f2,0x1fa433c,
        0x0fcfcc8,0x21c0857,0x0004f74 } },
    /* 160 */
    { { 0x01d161c,0x1744585,0x2d17528,0x03a4f13,0x267cd2e,0x30d861f,
        0x062a647,0x213284b,0x139ed25,0x27d4ca5,0x02fbbd6,0x31ddf11,
        0x3c50ac4,0x1dd86f7,0x00107de },
      { 0x16beebd,0x1b7317a,0x2151997,0x256a196,0x3be2aff,0x3621cab,
        0x0a9da19,0x05f3038,0x23da63c,0x3178d5e,0x215cc67,0x07f7f63,
        0x0c6d8d3,0x3bf5e5c,0x00c44bb } },
    /* 161 */
    { { 0x00c62f1,0x3e0f893,0x1572703,0x3b93865,0x19b1e28,0x389b33b,
        0x02858bf,0x0e3e9aa,0x04bc436,0x234e072,0x25ba43d,0x3dca19e,
        0x0274394,0x20f442e,0x003b4a7 },
      { 0x176451e,0x2b5ed5d,0x35c8ee1,0x25c52da,0x0c3d0b5,0x32b306e,
        0x030954f,0x275ecf7,0x10e472c,0x21577c4,0x02f8a32,0x321bb5c,
        0x0098f97,0x104e237,0x00d0433 } },
    /* 162 */
    { { 0x0a8f2fe,0x034548b,0x141f1a6,0x121246f,0x1616409,0x237f80d,
        0x2e29a55,0x1218db6,0x3ea278e,0x1669856,0x1ad7c8e,0x36d11de,
        0x2c2fcbb,0x18c0b3a,0x001c706 },
      { 0x1699b4b,0x2d531a6,0x17e85e2,0x1b48e78,0x2b509ca,0x2818ea0,
        0x0165fee,0x0b809ca,0x09db6a2,0x3dad798,0x326ee1d,0x204e416,
        0x091fa12,0x1c890e5,0x0007b9f } },
    /* 163 */
    { { 0x0ff4e49,0x0bb0512,0x0129159,0x05db591,0x03e4e9f,0x055ab30,
        0x0f82881,0x0ac2deb,0x3a8bb09,0x356a8d2,0x3d38393,0x03e4089,
        0x38187cd,0x1377a93,0x0041672 },
      { 0x0139e73,0x3990730,0x187d3c4,0x33e4793,0x2e0fe46,0x2ad87e2,
        0x33c792c,0x21d4fb6,0x1e4d386,0x2932d1b,0x20f1098,0x1270874,
        0x0ea6ee4,0x0167d6e,0x005e5fd } },
    /* 164 */
    { { 0x1856031,0x2b7519d,0x3bd07fc,0x337abcb,0x089c7a4,0x2a1f120,
        0x3523ce7,0x2ba406b,0x09561d9,0x1797f04,0x3cdb95f,0x2d6193e,
        0x32c7d3f,0x223aed6,0x00beb51 },
      { 0x2e65825,0x158f0ce,0x16413d1,0x310395f,0x3116854,0x250baf4,
        0x373d341,0x156cc47,0x104c069,0x0893716,0x195a0a6,0x035320e,
        0x37b7d8a,0x21b5755,0x00fb26b } },
    /* 165 */
    { { 0x286ae17,0x04239f1,0x1a56c53,0x0e74707,0x29090d7,0x2bb142b,
        0x03b0139,0x1aac916,0x08ba49a,0x0376682,0x3382f85,0x064bbab,
        0x2910e28,0x1d5bd7f,0x00cc8df },
      { 0x0ab7630,0x208e8e7,0x3fc1877,0x26bee39,0x264984a,0x192ff05,
        0x08ef9c3,0x0aa6951,0x071c44e,0x26eed3e,0x035c95e,0x06906ad,
        0x10a0690,0x397eaa9,0x00c6c23 } },
    /* 166 */
    { { 0x034d8dd,0x005b064,0x279bb78,0x12c2c4f,0x1856bb4,0x0c90681,
        0x06409ab,0x3b48617,0x19a2d78,0x0a34bf8,0x326eddf,0x31f09b5,
        0x04f04dc,0x3d7c944,0x003ccaf },
      { 0x321f843,0x35fb71a,0x1e4c397,0x377a5d7,0x2da88e4,0x3d6ada7,
        0x33d3964,0x1b30149,0x0e39aae,0x054dda0,0x3e6f946,0x1273394,
        0x3ffd3f7,0x2f6655e,0x00021dd } },
    /* 167 */
    { { 0x37233cf,0x11617dd,0x26f07b6,0x3d8250a,0x0fe6771,0x3f9bbbc,
        0x2aba7ad,0x200a58d,0x3568603,0x198eefa,0x1e8fcf3,0x3b9610b,
        0x20524ac,0x2a67528,0x0048d9a },
      { 0x1a5e57a,0x1e9d303,0x16c9cff,0x0f39527,0x3c23259,0x03c8a1e,
        0x104bccf,0x182d5a1,0x18dbc83,0x05b5f42,0x1b402f4,0x317c525,
        0x11bf1ea,0x3c46e1f,0x0061936 } },
    /* 168 */
    { { 0x0153a9d,0x36859ee,0x2cf0aa9,0x2b27a0f,0x0a49fe3,0x2d984e1,
        0x018f8e1,0x1378453,0x1ab3843,0x1987093,0x283dae9,0x25cf0e8,
        0x14fc93d,0x280609d,0x00c99ba },
      { 0x026b1e3,0x34663d3,0x2202477,0x21a9d45,0x212e8e1,0x18ab77e,
        0x2e52f63,0x0a14ce1,0x295c396,0x00c7a3d,0x2aaedb6,0x30abc4d,
        0x374acde,0x1318a73,0x00fcfdb } },
    /* 169 */
    { { 0x0a40298,0x3ba5633,0x11956b3,0x14fcbd7,0x3c38781,0x34bab96,
        0x165630e,0x1f3c831,0x37e3a69,0x2b4226c,0x2d5029e,0x3b4ab1e,
        0x1da6ac2,0x3eb43c3,0x007e5cd },
      { 0x1b86202,0x109b7f6,0x2054f98,0x2c50cd7,0x2ed1960,0x3c518e7,
        0x1b02463,0x319c07f,0x1c30db6,0x045fdc2,0x373421e,0x31a1eb9,
        0x1a8acbf,0x31289b0,0x0013fef } },
    /* 170 */
    { { 0x3fa0a5f,0x068661f,0x2109e36,0x00b18ff,0x1f4b261,0x31d3844,
        0x0acbc56,0x3aebc99,0x1fa77ab,0x152bd11,0x24cddb7,0x2313f74,
        0x06eea44,0x15f5114,0x000b131 },
      { 0x2e9993d,0x1ac565c,0x2cbe22a,0x3921797,0x12c3c57,0x360f868,
        0x33560bf,0x320ee99,0x382c3b8,0x39af88f,0x00bbe38,0x2c4ea59,
        0x3399b40,0x00ceb45,0x0066eea } },
    /* 171 */
    { { 0x0c6c693,0x31ba56d,0x3d3849f,0x378dabd,0x0efc735,0x17f90bf,
        0x13343d3,0x2df0f81,0x27c6a9a,0x13c2a90,0x0a0fcb2,0x27c10d9,
        0x3bc50c7,0x090e4fa,0x0016287 },
      { 0x2927e1e,0x35af405,0x184c5c3,0x3499cee,0x240158e,0x33522e6,
        0x386fc84,0x0a0b69f,0x1a660ea,0x34590fb,0x22a1bee,0x2ce4fab,
        0x31a9445,0x0e78655,0x00664c8 } },
    /* 172 */
    { { 0x3eeaf94,0x115d409,0x21e7577,0x097aa67,0x22875c9,0x021ab7a,
        0x27e7ba5,0x1093f04,0x2a086fe,0x05d9494,0x2b6c028,0x10f31b0,
        0x1312d11,0x262759c,0x00c9bb2 },
      { 0x1acb0a5,0x30cdf14,0x0f78880,0x0574f18,0x1a37109,0x098adbb,
        0x2113c09,0x2060925,0x1f89ce4,0x1974976,0x3381358,0x2dab5ca,
        0x2159c53,0x3af1303,0x000ea3b } },
    /* 173 */
    { { 0x1e49bea,0x29142b1,0x1a59cab,0x055f017,0x0684e54,0x39eb0db,
        0x29cab9d,0x255ee8b,0x35f2e6f,0x05329e6,0x09b817b,0x1ec091c,
        0x1df0fef,0x2641f62,0x00eb304 },
      { 0x2fe5096,0x3dcc1d1,0x2aaf508,0x3a0b813,0x0695810,0x144bddb,
        0x2f1bd93,0x281ae23,0x3513ebc,0x1ddd984,0x0cf158b,0x35218eb,
        0x257daf7,0x391253b,0x00b2a81 } },
    /* 174 */
    { { 0x153e6ba,0x22396db,0x0ea2ff2,0x2a45121,0x0a90de1,0x34cf23b,
        0x2db60ce,0x1a900be,0x2f328b6,0x355e75b,0x2c24372,0x0b75b77,
        0x2ec7d4f,0x3f24759,0x00e9e33 },
      { 0x39eab6e,0x2267480,0x3b5e110,0x1e8fa5e,0x2a31a66,0x3f739a3,
        0x00166dc,0x3552d88,0x3ae5137,0x3efa0fa,0x0800acd,0x17df61d,
        0x38c8608,0x04cc31b,0x00cf4ab } },
    /* 175 */
    { { 0x31e08fb,0x1961164,0x22c003f,0x078541b,0x3643855,0x30da587,
        0x11f0dc9,0x324595e,0x329e3dc,0x29a041e,0x3495d2c,0x0908dd3,
        0x1895b83,0x198dbb9,0x00d8cfb },
      { 0x0349b1b,0x383c5a8,0x2b86525,0x1b1283e,0x133cd2c,0x2be376a,
        0x012ee82,0x1eb4d1b,0x0ba71e9,0x01f3109,0x37621eb,0x1d9b77c,
        0x0d39069,0x3d5a97c,0x0095565 } },
    /* 176 */
    { { 0x20f5e94,0x1eefc86,0x1327e0e,0x054760b,0x2f771e1,0x3ac447e,
        0x033e3dc,0x198e040,0x04dd342,0x1b49a5d,0x00d01ef,0x3cb6768,
        0x1ceafbd,0x31c6812,0x001cb80 },
      { 0x221c677,0x060ca27,0x398b17f,0x0146723,0x36452af,0x02d9e65,
        0x39c5f78,0x3cf50d6,0x0be40f8,0x2970b87,0x26d667c,0x3e45959,
        0x16e7943,0x01673e7,0x009faaa } },
    /* 177 */
    { { 0x2078fe6,0x0918602,0x11dd8ad,0x399193f,0x0f6cc73,0x0f8dd12,
        0x2ce34dc,0x06d7d34,0x0c5e327,0x0989254,0x2fc5af7,0x2443d7b,
        0x32bc662,0x2fe2a84,0x008b585 },
      { 0x039327f,0x08e616a,0x252f117,0x1f52ab0,0x234e2d2,0x0a5b313,
        0x2f59ef6,0x0f7a500,0x15c4705,0x2c02b81,0x28b4f09,0x08aa5c8,
        0x0180efc,0x0993e83,0x00a9e86 } },
    /* 178 */
    { { 0x0310ecc,0x2d8892f,0x14ed0b7,0x3c59fe8,0x08a1a74,0x0850e57,
        0x1d09607,0x044a21f,0x109f5c9,0x237c6cf,0x06b264a,0x3fc8f1a,
        0x0d4c539,0x2740f96,0x00dc2d4 },
      { 0x1d6f501,0x0adf4ea,0x14f7215,0x0930102,0x3f4c32e,0x24e2643,
        0x366596d,0x081ff18,0x38f94fb,0x2c21341,0x328594c,0x267c75c,
        0x196b3fd,0x29932cb,0x0036def } },
    /* 179 */
    { { 0x3ed7cbe,0x26de044,0x3d0e461,0x0565e12,0x295e500,0x31dc17f,
        0x32251c2,0x3420ca8,0x3995f0d,0x2e8ddab,0x0361a45,0x10971b0,
        0x11e7b55,0x33bc7ca,0x00812d2 },
      { 0x3d94972,0x1606817,0x0383ccf,0x0e795b7,0x026e20e,0x0f6fefc,
        0x13685d6,0x315d402,0x0cc36b8,0x1c7f059,0x390ef5e,0x316ae04,
        0x08c66b9,0x2fac9a4,0x0040086 } },
    /* 180 */
    { { 0x3e3c115,0x153de4d,0x1a8ae5e,0x2330511,0x169b8ee,0x1d965c2,
        0x2edff2b,0x3ef99e6,0x1631b46,0x1f8a238,0x118d7bb,0x12113c3,
        0x26424db,0x0f4122a,0x00e0ea2 },
      { 0x3d80a73,0x30393bc,0x0f98714,0x278ef59,0x087a0aa,0x3b18c20,
        0x04b8a82,0x2068e21,0x030255d,0x3382b27,0x055397f,0x05448dd,
        0x2015586,0x1190be0,0x000b979 } },
    /* 181 */
    { { 0x2e03080,0x2895692,0x09fb127,0x2d1602a,0x1232306,0x105bd4e,
        0x28cd6a6,0x0a83813,0x1ee13b0,0x2abadc3,0x0c09684,0x00e33e1,
        0x033eea3,0x30f0a39,0x00a710e },
      { 0x01b1f7d,0x1c959da,0x017077a,0x254bf0a,0x086fbce,0x15cd6b2,
        0x008683f,0x23a4f4d,0x22a6bd6,0x14e8c93,0x0027d15,0x31d0d4f,
        0x271777e,0x1533510,0x00ab603 } },
    /* 182 */
    { { 0x34c209d,0x14d0abb,0x270432a,0x1d02358,0x22ba752,0x209757f,
        0x34af6fc,0x1ffc52e,0x1ced28e,0x1870e46,0x1e0340f,0x3f0bf73,
        0x33ba91d,0x2ebca7c,0x00c6580 },
      { 0x1d442cb,0x0879d50,0x24e4ae1,0x3f4e91c,0x04c7727,0x093cd1d,
        0x16d6a45,0x10a8b95,0x0c77856,0x361f84f,0x217845f,0x0bbeec6,
        0x0485718,0x33c5385,0x00dcec0 } },
    /* 183 */
    { { 0x1539819,0x225507a,0x1bf11cb,0x13e7653,0x0c8cb3b,0x05f695e,
        0x353f634,0x2827874,0x3fb8053,0x22de9a5,0x035d8b7,0x2105cc7,
        0x2a7a98d,0x35bed95,0x0085748 },
      { 0x1859c5d,0x00e51f0,0x22a21fd,0x3054d74,0x06ce965,0x328eab7,
        0x26a13e0,0x13bfc65,0x01d4fb1,0x36600b9,0x36dd3fc,0x01232ed,
        0x15bbaa9,0x0ad7a51,0x0089b18 } },
    /* 184 */
    { { 0x3360710,0x1eb5a90,0x136bd77,0x3bd57a6,0x0841287,0x12886c9,
        0x35c6700,0x21bc6eb,0x25f35ad,0x3bcb01c,0x0707e72,0x23e9943,
        0x03e5233,0x34bb622,0x002bf8e },
      { 0x16e0d6a,0x04b3d2d,0x290cb02,0x049a10c,0x350537e,0x22cf71b,
        0x3184a19,0x2dc8b62,0x2350210,0x3b4afa6,0x159781e,0x1d01b6d,
        0x1853440,0x16442f0,0x005a78d } },
    /* 185 */
    { { 0x348b02c,0x1ea8ab5,0x3b954d5,0x14684ac,0x0be5b34,0x11c4496,
        0x0a7a456,0x14f6eb7,0x11a3221,0x2d65f82,0x32eb1ea,0x09c4018,
        0x3f301f3,0x32e8a1c,0x00bd9ad },
      { 0x0543f7f,0x31e744e,0x1fefd1d,0x24a486c,0x1000220,0x3977e3b,
        0x1b3ef51,0x2512a1b,0x2049e6b,0x122232b,0x391a32b,0x2f4a7b1,
        0x1c13e71,0x081a9b4,0x00d3516 } },
    /* 186 */
    { { 0x1924f43,0x1ae5495,0x28d52ef,0x2b93e77,0x2d2f401,0x371a010,
        0x33e8d7a,0x06ed3f1,0x30c0d9d,0x2589fa9,0x3bf3567,0x2ecf8fa,
        0x2dee4c3,0x152b620,0x007e8a2 },
      { 0x1924407,0x01bd42d,0x044a089,0x18686b5,0x2f14a0e,0x17cdce3,
        0x0efa216,0x3c586a8,0x1d6ae71,0x375831f,0x3175894,0x20e43eb,
        0x34c009e,0x3480527,0x00d115c } },
    /* 187 */
    { { 0x12abf77,0x38b0769,0x25682f2,0x295508c,0x0c2a0dc,0x1259b73,
        0x023ea25,0x340e7b5,0x3c7cd0d,0x1f92324,0x176405c,0x1528894,
        0x18f2e1e,0x2c59c35,0x001efb5 },
      { 0x0fb1471,0x07e7665,0x141da75,0x07d9f4a,0x0fdb31e,0x0dccda6,
        0x074eb25,0x3d92a9b,0x11189a0,0x1b4c557,0x24b8d2b,0x0533f92,
        0x0e9e344,0x2fa3dea,0x008d5a4 } },
    /* 188 */
    { { 0x2669e98,0x1ad3514,0x2a035c9,0x08a3f50,0x24547f9,0x0a145d3,
        0x1c1319d,0x3fe833d,0x1ae064b,0x1e01734,0x246d27e,0x3a2f13c,
        0x01e1150,0x263f55e,0x00f89ef },
      { 0x2e0b63f,0x3e57db7,0x23a4b4f,0x11c8899,0x0ad8500,0x348f3a7,
        0x2918604,0x27d6409,0x1ce5001,0x38f94c2,0x29a508a,0x39bdc89,
        0x3a52c27,0x194899e,0x00e9376 } },
    /* 189 */
    { { 0x0368708,0x34a2730,0x2e1da04,0x0bd78c1,0x2c45887,0x0c44bfa,
        0x3a23de3,0x390b9db,0x1746efd,0x05c638e,0x1d20609,0x3263370,
        0x31987f0,0x2988529,0x005fa3c },
      { 0x0aa9f2a,0x20622f7,0x060deee,0x0c9626a,0x3312cc7,0x18ebac7,
        0x008dd6c,0x0ad4fe6,0x3db4ea6,0x1dc3f50,0x090b6e9,0x0aff8d2,
        0x26aa62c,0x18f3e90,0x00105f8 } },
    /* 190 */
    { { 0x38059ad,0x25e576c,0x3ea00b2,0x1fa4191,0x25686b7,0x2d1ce8f,
        0x30470ed,0x3478bbf,0x340f9b6,0x1c9e348,0x3d594ec,0x2ffe56e,
        0x3f23deb,0x0cd34e9,0x00f4b72 },
      { 0x1a83f0b,0x2166029,0x28b32a2,0x06a5c5a,0x20786c4,0x0944604,
        0x0901bd2,0x379b84e,0x221e2fe,0x0346d54,0x1f4eb59,0x01b8993,
        0x2462e08,0x25f9d8b,0x006c4c8 } },
    /* 191 */
    { { 0x0b41d9d,0x2e417ed,0x265bd10,0x199148e,0x3826ca4,0x1a67e8d,
        0x1bbd13b,0x23e414d,0x3d773bc,0x356e64c,0x0d2118a,0x0cb587f,
        0x25fd093,0x24fb529,0x00158c6 },
      { 0x2806e63,0x3ecaa39,0x251b4dd,0x3b2d779,0x2e31ed3,0x066f1a6,
        0x060e518,0x2c7e3e5,0x0d62c76,0x0d88a70,0x101970a,0x1e3c8c6,
        0x272b8bb,0x083e73b,0x0031f38 } },
    /* 192 */
    { { 0x09e1c72,0x072bcb0,0x0cf4e93,0x2604a64,0x00715f2,0x10c98b6,
        0x2ad81d9,0x234fcce,0x37a7304,0x1974a4a,0x1c7415f,0x14aaa93,
        0x19587b1,0x3f643f4,0x00c3d10 },
      { 0x1ddadd0,0x2cd715d,0x294cf76,0x14479ed,0x19f5f4a,0x0198c09,
        0x1ab7ebc,0x182c0bc,0x0879202,0x1807273,0x05d39da,0x2c7d868,
        0x29c4ec4,0x1b13ad2,0x006dcd7 } },
    /* 193 */
    { { 0x1c83f01,0x0245bff,0x24f90ba,0x112554f,0x2354c8b,0x3f17988,
        0x0c511af,0x39e1e9b,0x26ae95b,0x0ae551c,0x35b41a6,0x0120455,
        0x1e989cb,0x1b37aff,0x00fa2ae },
      { 0x324659a,0x1aef1c3,0x1c43637,0x3f530a2,0x313a999,0x326af62,
        0x134184e,0x2ac131c,0x3f6a789,0x30a300a,0x13e526e,0x2107af3,
        0x093a8ff,0x2479902,0x00442b1 } },
    /* 194 */
    { { 0x22b6e20,0x31b18be,0x18614ca,0x26fdb5a,0x197f29e,0x325b44b,
        0x0ab1dbb,0x042348a,0x3275e8e,0x15bae44,0x0077124,0x2cf5345,
        0x2803ad4,0x188f2a2,0x0061b20 },
      { 0x2a560b1,0x3ced069,0x3cf42c2,0x100e167,0x3879e1d,0x0936ff0,
        0x1b51450,0x14c55f3,0x3153bfa,0x2957423,0x2a93823,0x15f5dce,
        0x2c9a22f,0x16731a8,0x00a97f2 } },
    /* 195 */
    { { 0x18edbbb,0x18c5ef9,0x1f13c30,0x071e77f,0x225ade5,0x1b60f75,
        0x1beaf11,0x3e495ad,0x2441dd8,0x2fa00e2,0x32a87b6,0x00050f2,
        0x038de7f,0x0037d6d,0x00a885d },
      { 0x39e48bd,0x1d9e433,0x2768e9f,0x3c29458,0x3f0bdf9,0x35ed5f2,
        0x36709fa,0x176dc10,0x012f7c1,0x2df8547,0x1d90ee3,0x053c089,
        0x21a8d35,0x200cb0d,0x002e84e } },
    /* 196 */
    { { 0x23ec8d8,0x1d81f55,0x0cb7227,0x07f8e4d,0x2a66181,0x163f577,
        0x272e7af,0x131a8f2,0x2046229,0x25e6276,0x36bbefe,0x2cdc22f,
        0x17c8288,0x33dd4fb,0x000d524 },
      { 0x330c073,0x1a6728b,0x1cf369f,0x12e7707,0x2f0fa26,0x17c2abd,
        0x0a45680,0x26ebd13,0x3c7d19b,0x1c3d6c8,0x2abd110,0x064fd07,
        0x09b8339,0x02b4a9f,0x009e3e1 } },
    /* 197 */
    { { 0x0ae972f,0x2093c35,0x06e7a90,0x0af1ba1,0x243eef0,0x2748582,
        0x0606122,0x13a45f9,0x0acfe60,0x08a685e,0x0eb184b,0x015bc11,
        0x0cdf423,0x157fad5,0x004fcad },
      { 0x2728d15,0x3e5bceb,0x0331a0f,0x31b1a80,0x28a2680,0x3b94955,
        0x04cae07,0x176b57e,0x03ac5a6,0x3d7918b,0x22d23f4,0x0ae077f,
        0x1eb075d,0x006f16c,0x006e473 } },
    /* 198 */
    { { 0x38219b9,0x0475a2b,0x107a774,0x39946c6,0x1cb883c,0x004e0ed,
        0x087e571,0x25c3497,0x059982f,0x0a71f66,0x118305d,0x1aaf294,
        0x3a5dbaa,0x34be404,0x00725fe },
      { 0x3abd109,0x336ebea,0x2528487,0x15a1d61,0x0c0f8cf,0x2b56095,
        0x2591e68,0x3549a80,0x1d1debb,0x0701c6c,0x161e7e3,0x1f7fa2e,
        0x3dfe192,0x17e6498,0x0055f89 } },
    /* 199 */
    { { 0x175645b,0x26c036c,0x0b92f89,0x09ed96d,0x351f3a6,0x19ce67b,
        0x33ac8db,0x2f0828b,0x27fe400,0x0b9c5e1,0x1967b95,0x3324080,
        0x11de142,0x1d44fb3,0x003d596 },
      { 0x3979775,0x3af37b6,0x3e88d41,0x2f1a8b9,0x299ba61,0x085413c,
        0x1149a53,0x0beb40e,0x31427ba,0x239f708,0x357d836,0x1558c22,
        0x280a79f,0x1b255f6,0x002b6d1 } },
    /* 200 */
    { { 0x39ad982,0x3d79d89,0x01a684a,0x0b6722e,0x39bb4c9,0x39a6399,
        0x1ad44e0,0x3059f5e,0x048265f,0x33a2fa4,0x0c3a4cc,0x0d7df98,
        0x23a33f1,0x34e2e21,0x00a0a10 },
      { 0x386efd9,0x1c91f34,0x06c2e19,0x3e6d48d,0x00eefd3,0x2181ef2,
        0x2415f97,0x1d33b08,0x0625086,0x1e8aa3e,0x08c9d60,0x0ab427b,
        0x2764fa7,0x3b7943e,0x00cd9f0 } },
    /* 201 */
    { { 0x1a46d4d,0x0e471f4,0x1693063,0x0467ac0,0x22df51c,0x127a0f7,
        0x0498008,0x20e0b16,0x1aa8ad0,0x1923f42,0x2a74273,0x01761ce,
        0x1600ca4,0x187b87e,0x00ee49e },
      { 0x0c76f73,0x19daf92,0x0b2ad76,0x3d8049d,0x1d9c100,0x0fe1c63,
        0x0bb67c8,0x035cc44,0x02002fc,0x37b2169,0x344656a,0x1127879,
        0x1939bc0,0x0dd8df6,0x0028ce7 } },
    /* 202 */
    { { 0x0544ac7,0x26bdc91,0x042697e,0x356e804,0x1f2c658,0x2ceb7ef,
        0x2dec39f,0x02c1dcc,0x391a2df,0x2344beb,0x2171e20,0x3099c94,
        0x0fa548a,0x37216c9,0x00f820c },
      { 0x0f4cf77,0x29bbaa5,0x33c6307,0x34a5128,0x118c783,0x2dd06b1,
        0x139d4c0,0x2db912e,0x1153ffb,0x1075eb3,0x3a255e4,0x2892161,
        0x36d5006,0x125338c,0x0014fbc } },
    /* 203 */
    { { 0x1584e3c,0x0830314,0x00279b9,0x167df95,0x2c7733c,0x2108aef,
        0x0ce1398,0x35aaf89,0x012523b,0x3c46b6a,0x388e6de,0x01a2002,
        0x0582dde,0x19c7fa3,0x007b872 },
      { 0x1e53510,0x11bca1f,0x19684e7,0x267de5c,0x2492f8b,0x364a2b0,
        0x080bc77,0x2c6d47b,0x248432e,0x3ace44f,0x32028f6,0x0212198,
        0x2f38bad,0x20d63f0,0x00122bb } },
    /* 204 */
    { { 0x30b29c3,0x3cec78e,0x01510a9,0x0c93e91,0x3837b64,0x1eca3a9,
        0x105c921,0x05d42e6,0x1379845,0x07ce6f2,0x0e8b6da,0x0e0f093,
        0x220b2cd,0x1f6c041,0x00299f5 },
      { 0x0afdce3,0x2b0e596,0x2f477b6,0x2ccf417,0x3a15206,0x26ec0bf,
        0x2e37e2b,0x2593282,0x0ab9db3,0x2841dd8,0x27954be,0x277a681,
        0x03f82e2,0x2b610c7,0x00446a1 } },
    /* 205 */
    { { 0x06b8195,0x3b3a817,0x31b9c6f,0x317d279,0x3d744a7,0x1de9eb9,
        0x296acc1,0x1ce9ea3,0x06c3587,0x246815d,0x3756736,0x0588518,
        0x1c971a4,0x1fde1f4,0x00aa021 },
      { 0x3fd3226,0x274561d,0x00be61e,0x01393d8,0x30f6f23,0x29b7fc1,
        0x04cebc7,0x0a892a7,0x20109f1,0x27456be,0x0c863ee,0x2eb6c8a,
        0x38c782b,0x039397a,0x00a2829 } },
    /* 206 */
    { { 0x29de330,0x21fe80f,0x145b55b,0x1986570,0x012b260,0x2482fbc,
        0x0536e0a,0x16b7382,0x32c4d19,0x1deffdb,0x145f418,0x0c67a76,
        0x2ce477f,0x218fe24,0x00f9848 },
      { 0x3e37657,0x3f074d3,0x245ad0e,0x20973c3,0x23c58de,0x2c332ef,
        0x2ad21a8,0x0bf1589,0x208af95,0x1f4a8c4,0x2b43735,0x1e46657,
        0x15d4f81,0x0c3e63a,0x005f19d } },
    /* 207 */
    { { 0x26865bb,0x20f6683,0x16a672e,0x0efd8d1,0x222f5af,0x18f2367,
        0x1e9c734,0x25c3902,0x178dfe6,0x2903a79,0x311b91c,0x1adbbe9,
        0x225a387,0x0b3e509,0x0089551 },
      { 0x34e462b,0x23b6a32,0x27c884c,0x129104b,0x384c015,0x3adedc7,
        0x325db1c,0x021dc10,0x1e366f7,0x3054df7,0x1992b9a,0x2824e64,
        0x0ae77f3,0x181b526,0x00a7316 } },
    /* 208 */
    { { 0x2d260f5,0x2434bf2,0x28c0139,0x0a7bb03,0x176c3be,0x3def5f5,
        0x05bee00,0x3692df7,0x3d2efeb,0x3a6f859,0x1122b87,0x38f779a,
        0x1415ccc,0x2c260ad,0x0075a28 },
      { 0x04607a6,0x042f37a,0x3f0df68,0x0a1bd36,0x3c6d581,0x2d36bfa,
        0x2d577d1,0x0a3affa,0x0b2066b,0x2e6f110,0x0b17e84,0x3c76a5e,
        0x1a57553,0x012f36a,0x0004595 } },
    /* 209 */
    { { 0x29e5836,0x0e6808c,0x269d13e,0x147dc5c,0x32c9e7d,0x09b258e,
        0x2c58d6f,0x1efd716,0x0437996,0x34ec31b,0x15908d9,0x2efa8fd,
        0x09ad160,0x079fc1f,0x00d8481 },
      { 0x3d20e4a,0x18269d6,0x3aa8fe7,0x34829c2,0x2e4325d,0x0d800e1,
        0x11f370b,0x10c08dc,0x22fd092,0x1a5fe55,0x0acc443,0x037030d,
        0x1cdd404,0x097379e,0x00fd6d7 } },
    /* 210 */
    { { 0x313eafb,0x3f438f3,0x2e5fb3e,0x2ed6a82,0x121009c,0x240889e,
        0x00c5537,0x269b792,0x334b2fc,0x1dd573c,0x07096ae,0x19296fc,
        0x3813985,0x2742f48,0x00ddd64 },
      { 0x2045041,0x3842c62,0x1572d0d,0x04f255f,0x06e05b4,0x383ec97,
        0x1ff8064,0x18bed71,0x39b6411,0x2764cc5,0x257439f,0x3521217,
        0x172aa42,0x342a2a3,0x0070c5b } },
    /* 211 */
    { { 0x3bdf646,0x1c5ce25,0x1f7ca76,0x2d2acca,0x3aa1485,0x23c97f7,
        0x3e11d6f,0x0609338,0x07ec622,0x01da8ff,0x3392474,0x17ca07f,
        0x13a9a04,0x353a5b4,0x0024557 },
      { 0x14c27cd,0x32012f7,0x3fea875,0x3d03d71,0x211c5f0,0x3157fdf,
        0x0c880bd,0x3c406b2,0x2c51103,0x24ab377,0x399faa8,0x0d06887,
        0x16b5738,0x28b33a7,0x00c7b67 } },
    /* 212 */
    { { 0x2357586,0x35c93e3,0x0da09a0,0x3d77d92,0x11d7f4f,0x37b98a9,
        0x3e6c9bf,0x2cdca70,0x2f00389,0x2412673,0x18eab87,0x0101436,
        0x11617e9,0x06d9b01,0x00e8eef },
      { 0x37e3ca9,0x16ffaf0,0x391debf,0x1b69382,0x07c5e94,0x312fa8a,
        0x0973142,0x2cadde4,0x109ee67,0x3a07db0,0x1afc5ed,0x08df66f,
        0x304c7af,0x0804aae,0x00d2e60 } },
    /* 213 */
    { { 0x24f57bf,0x1818322,0x182a615,0x25bfc44,0x0f97586,0x0a5bbc0,
        0x36773c6,0x1a2660c,0x3ceff66,0x3270152,0x319cd11,0x2845845,
        0x1acfad6,0x19076f8,0x009824a },
      { 0x289fd01,0x2de97ee,0x39d80b7,0x026227d,0x0f8d3b8,0x15e0a17,
        0x21ea08f,0x20a2317,0x136ae6d,0x3deb1d1,0x3521ef5,0x0de8801,
        0x0a25d5d,0x0612c98,0x005ecc4 } },
    /* 214 */
    { { 0x308c8d3,0x3aec669,0x01ecddc,0x13f18fe,0x1e63ed0,0x061cfe5,
        0x05f5a01,0x1db5741,0x14479f2,0x0ced6b5,0x025ae5b,0x09ca8f5,
        0x2160581,0x1404433,0x008bfeb },
      { 0x08228bf,0x0e02722,0x37df423,0x33ecabf,0x34bd82a,0x32f529f,
        0x28f1800,0x0c8f671,0x1246b44,0x1ff35dc,0x091db95,0x303f3da,
        0x28f7f60,0x3624136,0x00cfbb4 } },
    /* 215 */
    { { 0x326139a,0x2977e4e,0x3eb89a6,0x20ecb31,0x13e076a,0x2a592f3,
        0x28e82d5,0x235ad1e,0x239b927,0x262938a,0x2444354,0x141b263,
        0x0d56693,0x2a3fc7