/*
    Copyright (C) 2024 Albin Ahlbäck
    Copyright (C) 2024 Fredrik Johansson

    Uses code adopted from the GNU MPFR Library.

        Copyright 2005-2024 Free Software Foundation, Inc.
        Contributed by the AriC and Caramba projects, INRIA.

    This file is part of FLINT.

    FLINT is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 3 of the License, or
    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
*/

#include <string.h>
#include "mpn_extras.h"

/* Generated by tune-sqrhigh.c */
static const signed short flint_mpn_sqrhigh_k_tab[FLINT_MPN_SQRHIGH_K_TAB_SIZE] = 
{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 29, 0, 0, 31, 0, 31, 
0, 32, 0, 34, 0, 36, 36, 40, 0, 40, 40, 40, 0, 44, 0, 44, 44, 48, 0, 52, 48, 52, 44, 44, 48, 48, 48, 48, 48, 48, 
48, 48, 48, 48, 52, 52, 52, 52, 52, 56, 56, 52, 56, 56, 56, 56, 56, 56, 56, 56, 60, 60, 68, 64, 60, 60, 60, 60, 64, 64, 
64, 68, 68, 68, 64, 64, 68, 68, 72, 72, 68, 68, 68, 72, 76, 80, 72, 72, 72, 88, 76, 76, 80, 80, 76, 80, 76, 80, 84, 80, 
88, 88, 80, 84, 88, 80, 80, 84, 92, 88, 92, 88, 88, 88, 88, 96, 88, 108, 100, 92, 88, 88, 104, 100, 100, 92, 104, 108, 100, 92, 
104, 100, 104, 96, 108, 104, 96, 96, 104, 100, 100, 104, 112, 116, 108, 104, 104, 116, 108, 104, 104, 120, 116, 104, 108, 132, 116, 108, 120, 108, 
108, 108, 132, 108, 120, 112, 112, 116, 132, 128, 116, 124, 128, 116, 132, 120, 132, 120, 124, 120, 132, 120, 124, 128, 120, 128, 128, 132, 144, 124, 
128, 124, 140, 128, 128, 124, 136, 132, 128, 128, 140, 144, 128, 128, 140, 136, 132, 144, 148, 152, 144, 132, 160, 156, 140, 144, 156, 144, 140, 144, 
140, 156, 156, 156, 140, 144, 168, 156, 156, 164, 168, 156, 156, 160, 144, 144, 180, 156, 152, 168, 156, 160, 156, 148, 180, 168, 180, 156, 164, 156, 
156, 172, 156, 156, 156, 180, 180, 172, 180, 168, 164, 172, 164, 172, 176, 176, 168, 176, 172, 176, 180, 168, 176, 180, 180, 180, 192, 184, 180, 176, 
204, 176, 188, 180, 188, 180, 204, 180, 192, 204, 180, 192, 180, 204, 228, 192, 188, 192, 204, 180, 192, 216, 200, 216, 228, 216, 204, 216, 188, 216, 
216, 204, 216, 192, 204, 212, 228, 204, 228, 216, 204, 216, 192, 204, 204, 212, 204, 216, 204, 216, 228, 228, 224, 212, 204, 216, 216, 224, 228, 212, 
216, 228, 212, 204, 204, 216, 216, 216, 228, 216, 220, 224, 228, 220, 228, 228, 236, 224, 260, 224, 228, 228, 260, 228, 248, 252, 264, 248, 216, 212, 
212, 212, 220, 216, 216, 220, 216, 224, 216, 228, 232, 224, 220, 224, 240, 244, 236, 244, 232, 256, 288, 240, 236, 288, 260, 260, 264, 232, 228, 228, 
228, 236, 236, 228, 236, 236, 236, 248, 256, 260, 232, 236, 264, 256, 260, 252, 284, 252, 264, 276, 244, 296, 244, 240, 244, 248, 240, 252, 256, 260, 
260, 252, 252, 272, 252, 272, 260, 296, 268, 260, 256, 276, 272, 264, 312, 284, 256, 252, 252, 252, 264, 252, 256, 260, 264, 276, 292, 268, 264, 276, 
264, 260, 264, 304, 288, 296, 296, 296, 288, 296, 272, 280, 264, 264, 272, 264, 288, 288, 280, 288, 272, 280, 296, 296, 280, 288, 296, 320, 344, 320, 
344, 272, 344, 344, 304, 288, 280, 280, 312, 280, 280, 304, 304, 312, 304, 296, 288, 328, 320, 352, 320, 320, 328, 360, 344, 344, 360, 288, 288, 304, 
288, 296, 320, 320, 312, 312, 304, 328, 336, 312, 312, 360, 336, 344, 344, 336, 360, 360, 296, 296, 304, 360, 328, 328, 312, 320, 320, 328, 312, 344, 
344, 328, 344, 344, 368, 360, 352, 360, 392, 392, 368, 320, 312, 392, 312, 328, 344, 336, 344, 328, 360, 352, 352, 360, 360, 360, 368, 408, 360, 376, 
392, 392, 376, 336, 344, 352, 360, 352, 344, 344, 384, 344, 360, 376, 392, 368, 360, 408, 408, 448, 432, 384, 392, 336, 360, 344, 360, 360, 368, 376, 
376, 360, 368, 408, 368, 376, 376, 376, 432, 376, 384, 464, 432, 344, 432, 376, 344, 344, 384, 344, 384, 376, 400, 432, 456, 432, 456, 392, 432, 392, 
448, 456, 360, 376, 456, 408, 384, 368, 376, 432, 376, 472, 464, 504, 448, 360, 408, 456, 376, 408, 424, 424, 448, 440, 456, 392, 408, 384, 408, 392, 
384, 416, 424, 432, 400, 472, 480, 408, 432, 432, 464, 456, 504, 464, 456, 472, 496, 416, 392, 424, 504, 400, 440, 432, 472, 448, 456, 456, 432, 448, 
456, 504, 504, 512, 496, 504, 424, 416, 408, 432, 544, 432, 440, 456, 448, 448, 464, 472, 480, 552, 552, 544, 552, 560, 552, 544, 544, 544, 552, 552, 
552, 552, 552, 552, 552, 560, 552, 552, 560, 560, 560, 560, 568, 576, 568, 592, 576, 560, 560, 560, 560, 568, 568, 576, 568, 568, 576, 568, 568, 576, 
568, 568, 600, 592, 576, 608, 576, 576, 576, 576, 584, 584, 576, 592, 592, 600, 584, 592, 584, 600, 600, 600, 584, 624, 584, 592, 600, 592, 616, 592, 
592, 592, 592, 592, 592, 592, 600, 608, 600, 608, 600, 600, 600, 624, 616, 608, 616, 600, 632, 608, 608, 608, 616, 608, 608, 608, 608, 616, 608, 664, 
664, 632, 656, 616, 664, 624, 616, 632, 632, 616, 624, 624, 624, 632, 624, 624, 624, 624, 648, 632, 624, 624, 872, 872, 656, 664, 696, 872, 872, 872, 
632, 872, 872, 896, 872, 896, 872, 896, 880, 872, 896, 872, 888, 872, 872, 872, 872, 872, 888, 872, 880, 824, 872, 856, 880, 888, 800, 848, 800, 880, 
848, 800, 808, 872, 872, 864, 824, 840, 872, 872, 872, 872, 872, 872, 872, 872, 872, 872, 880, 928, 872, 920, 872, 920, 888, 880, 872, 880, 872, 872, 
872, 888, 880, 888, 888, 872, 888, 872, 880, 896, 920, 920, 880, 928, 904, 872, 888, 904, 872, 872, 872, 872, 880, 888, 872, 872, 872, 880, 872, 904, 
920, 888, 888, 872, 880, 880, 888, 896, 896, 880, 872, 904, 880, 912, 896, 872, 904, 904, 912, 880, 880, 920, 912, 896, 928, 928, 872, 872, 872, 872, 
880, 880, 888, 888, 888, 912, 880, 880, 880, 912, 896, 896, 928, 896, 912, 912, 896, 928, 928, 928, 928, 912, 880, 928, 880, 880, 896, 880, 896, 880, 
896, 880, 880, 912, 912, 880, 896, 880, 896, 912, 880, 880, 928, 912, 912, 928, 928, 896, 928, 912, 928, 928, 928, 928, 928, 928, 880, 880, 880, 880, 
880, 912, 896, 912, 912, 896, 928, 896, 928, 896, 928, 928, 928, 928, 912, 880, 928, 912, 896, 928, 896, 880, 912, 896, 912, 896, 896, 896, 928, 896, 
912, 928, 912, 928, 912, 928, 912, 880, 928, 928, 880, 928, 928, 880, 896, 928, 880, 896, 912, 928, 928, 896, 912, 928, 912, 928, 1024, 928, 928, 1040, 
912, 928, 1024, 1024, 928, 928, 928, 928, 912, 1040, 1072, 912, 1072, 1072, 1056, 1040, 1072, 1088, 1104, 1040, 1040, 928, 1024, 928, 928, 1056, 928, 928, 1056, 1040, 
928, 1056, 1024, 1056, 1056, 1056, 1104, 1024, 1056, 1072, 1056, 1056, 1088, 1072, 1120, 1104, 1072, 1072, 1072, 1104, 1120, 1024, 1120, 1120, 1056, 1024, 1040, 1024, 1040, 1040, 
1040, 1024, 1040, 1056, 1040, 1088, 1040, 1056, 1120, 1040, 1040, 1072, 1072, 1088, 1104, 1088, 1104, 1088, 1088, 1056, 1120, 1104, 1152, 1104, 1088, 1120, 1104, 1120, 1024, 1120, 
1152, 1136, 1152, 1056, 1056, 1072, 1056, 1152, 1088, 1056, 1104, 1072, 1104, 1088, 1120, 1072, 1104, 1088, 1088, 1120, 1120, 1120, 1152, 1136, 1120, 1120, 1152, 1120, 1136, 1152, 
1152, 1072, 1136, 1136, 1088, 1152, 1136, 1136, 1104, 1056, 1072, 1088, 1104, 1104, 1120, 1120, 1104, 1104, 1104, 1136, 1152, 1136, 1152, 1136, 1136, 1152, 1152, 1152, 1136, 1072, 
1152, 1136, 1152, 1152, 1104, 1152, 1104, 1152, 1152, 1088, 1120, 1152, 1136, 1136, 1120, 1136, 1120, 1152, 1136, 1136, 1120, 1136, 1152, 1136, 1152, 1136, 1136, 1120, 1152, 1152, 
1104, 1152, 1152, 1152, 1104, 1120, 1104, 1120, 1120, 1120, 1120, 1136, 1136, 1136, 1136, 1152, 1120, 1120, 1152, 1152, 1152, 1136, 1152, 1136, 1136, 1152, 1120, 1152, 1152, 1104, 
1120, 1152, 1136, 1136, 1136, 1136, 1136, 1136, 1120, 1152, 1152, 1152, 1136, 1104, 1152, 1136, 1136, 1152, 1136, 1120, 1136, 1152, 1152, 1136, 1136, 1120, 1136, 1136, 1136, 1120, 
1152, 1152, 1152, 1136, 1136, 1136, 1152, 1104, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1136, 1136, 1136, 1136, 1152, 1136, 1136, 1152, 1136, 1152, 1152, 1136, 1120, 1152, 
1136, 1136, 1152, 1136, 1152, 1152, 1152, 1152, 1136, 1136, 1136, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1136, 1152, 1152, 1136, 1136, 1136, 1136, 1152, 1152, 1152, 1152, 1152, 
1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1136, 1152, 1152, 1136, 1152, 1136, 1136, 1152, 1120, 1152, 1152, 1152, 1152, 1136, 1136, 1152, 1136, 1152, 
1120, 1152, 1136, 1136, 1152, 1152, 1152, 1152, 1152, 1152, 1136, 1152, 1136, 1152, 1152, 1136, 1152, 1152, 1152, 1136, 1136, 1152, 1136, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 
1136, 1152, 1152, 1152, 1152, 1152, 1136, 1136, 1152, 1136, 1152, 1120, 1152, 1152, 1152, 1136, 1152, 1136, 1152, 1152, 1136, 1152, 1136, 1136, 1136, 1136, 1152, 1152, 1152, 1136, 
1152, 1152, 1152, 1104, 1120, 1152, 1136, 1152, 1152, 1136, 1152, 1136, 1152, 1152, 1152, 1136, 1136, 1152, 1152, 1120, 1152, 1136, 1152, 1152, 1120, 1152, 1120, 1136, 1152, 1152, 
1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1152, 1120, 1152, 1152, 1136, 1152, 1136, 1552, 1552, 1136, 1136, 1152, 1552, 1136, 1136, 1584, 1552, 1552, 1552, 
1152, 1552, 1552, 1152, 1584, 1552, 1152, 1152, 1552, 1584, 1152, 1152, 1552, 1584, 1568, 1552, 1552, 1552, 1152, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 
1552, 1552, 1552, 1552, 1552, 1625, 1552, 1552, 1552, 1552, 1552, 1552, 1552, 1568, 1632, 1552, 1632, 1568, 1584, 1568, 1568, 1584, 1600, 1552, 1552, 1632, 1600, 1632, 1632, 1632, 
1632, 1568, 1584, 1632, 1632, 1632, 1632, 1632, 1632, 1632, 1632, 1552, 1662, 1632, 1632, 1632, 1648, 1632, 1632, 1632, 1568, 1600, 1632, 1552, 1616, 1632, 1632, 1632, 1584, 1632, 
1632, 1552, 1632, 1632, 1632, 1568, 1600, 1632, 1648, 1616, 1632, 1648, 1632, 1632, 1632, 1632, 1648, 1632, 1632, 1632, 1632, 1632, 1632, 1632, 1680, 1664, 1632, 1664, 1632, 1632, 
1632, 1632, 1632, 1632, 1632, 1648, 1648, 1648, 1616, 1632, 1680, 1632, 1680, 1648, 1632, 1664, 1632, 1648, 1632, 1632, 1632, 1632, 1632, 1664, 1632, 1632, 1632, 1632, 1632, 1632, 
1648, 1648, 1632, 1632, 1632, 1632, 1632, 1632, 1632, 1664, 1632, 1648, 1664, 1648, 1648, 1632, 1648, 1632, 1680, 1664, 1648, 1680, 1664, 1696, 1632, 1664, 1680, 1632, 1648, 1696, 
1680, 1632, 1632, 1648, 1648, 1648, 1632, 1664, 1632, 1648, 1648, 1632, 1648, 1648, 1632, 1664, 1664, 1680, 1680, 1632, 1664, 1664, 1648, 1648, 1712, 1728, 1664, 1712, 1696, 1712, 
1680, 1632, 1728, 1648, 1728, 1728, 1712, 1648, 1632, 1632, 1664, 1648, 1664, 1664, 1680, 1712, 1664, 1632, 1696, 1664, 1712, 1696, 1680, 1680, 1696, 1680, 1680, 1712, 1648, 1712, 
1632, 1728, 1696, 1648, 1712, 1632, 1712, 1696, 1648, 1680, 1648, 1664, 1648, 1696, 1712, 1648, 1648, 1680, 1664, 1696, 1728, 1696, 1712, 1728, 1696, 1728, 1712, 1664, 1680, 1728, 
1712, 1712, 1712, 1728, 1728, 1712, 1728, 1696, 1728, 1712, 1696, 1728, 1728, 1680, 1728, 1712, 1664, 1680, 1680, 1696, 1712, 1696, 1696, 1696, 1728, 1696, 1728, 1728, 1696, 1712, 
1712, 1664, 1712, 1680, 1664, 1728, 1728, 1664, 1696, 1696, 1680, 1712, 1680, 1712, 1696, 1728, 1696, 1696, 1728, 1696, 1712, 1696, 1712, 1712, 1712, 1728, 1712, 1696, 1728, 1680, 
1696, 1712, 1712, 1728, 1712, 1728, 1696, 1728, 1776, 1728, 1696, 1776, 1728, 1728, 1712, 1712, 1824, 1856, 1728, 1824, 1728, 1728, 1712, 1728, 1728, 1728, 1856, 1696, 1728, 1840, 
1712, 1712, 1824, 1856, 1792, 1712, 1840, 1728, 1808, 1728, 1824, 1824, 1840, 1824, 1824, 1856, 1856, 1824, 1856, 1840, 1696, 1856, 1840, 1840, 1856, 1856, 1824, 1712, 1792, 1856, 
1824, 1728, 1808, 1792, 1856, 1728, 1792, 1840, 1808, 1808, 1712, 1808, 1840, 1808, 1824, 1824, 1824, 1840, 1824, 1840, 1856, 1824, 1728, 1856, 1856, 1824, 1856, 1856, 1792, 1792, 
1856, 1824, 1856, 1824, 1824, 1856, 1808, 1824, 1856, 1856, 1840, 1840, 1840, 1840, 1856, 1840, 1840, 1856, 1824, 1840, 1808, 1824, 1840, 1856, 1856, 1824, 1856, 1840, 1840, 1840, 
1824, 1824, 1840, 1840, 1840, 1856, 1856, 1856, };

void
_flint_mpn_sqrhigh_mulders_recursive(mp_ptr rp, mp_srcptr np, mp_size_t n)
{
    mp_limb_t cy;
    mp_size_t l;
    slong k;

    if (FLINT_HAVE_SQRHIGH_FUNC(n))
    {
        rp[n - 1] = flint_mpn_sqrhigh_func_tab[n](rp + n, np);
        return;
    }

    if (n < FLINT_MPN_SQRHIGH_K_TAB_SIZE)
        k = flint_mpn_sqrhigh_k_tab[n];
    else
        k = (n + 4) / 2;

    if (k == 0)
    {
        rp[n - 1] = _flint_mpn_sqrhigh_basecase(rp + n, np, n);
        return;
    }

    FLINT_ASSERT(k >= (n + 4) / 2);

    if (k == n)
    {
        flint_mpn_sqr(rp, np, n);
        return;
    }

    l = n - k;

    flint_mpn_sqr(rp + 2 * l, np + l, k);            /* fills rp[2l..2n-1] */
    _flint_mpn_mulhigh_n_mulders_recursive(rp, np, np + k, l);         /* fills rp[l-1..2l-1] */
    /* {rp+n-1,l+1} += 2 * {rp+l-1,l+1} */
    cy = mpn_lshift(rp + l - 1, rp + l - 1, l + 1, 1);
    cy += mpn_add_n(rp + n - 1, rp + n - 1, rp + l - 1, l + 1);
    MPN_INCR_U(rp + n + l, k, cy); /* propagate carry */

    /* Corrections for precise high product (not needed for the
       original Mulders). */
    {
        mp_limb_t hi, lo;

        /* Note: if we relax the condition on k, we will need
           a branch for k == l here to avoid double counting. */
        FLINT_ASSERT(k != l);

        umul_ppmm(hi, lo, np[k - 1], np[l - 1]);
        MPN_INCR_U(rp + n - 1, n + 1, hi);
        MPN_INCR_U(rp + n - 1, n + 1, hi);
    }
}

mp_limb_t
_flint_mpn_sqrhigh_mulders(mp_ptr res, mp_srcptr u, mp_size_t n)
{
    mp_ptr tmp;
    mp_limb_t bot;
    TMP_INIT;

    TMP_START;
    tmp = TMP_ALLOC(sizeof(mp_limb_t) * (2 * n));
    _flint_mpn_sqrhigh_mulders_recursive(tmp, u, n);
    memcpy(res, tmp + n, sizeof(mp_limb_t) * n);
    bot = tmp[n - 1];

    TMP_END;
    return bot;
}

mp_limb_t
_flint_mpn_sqrhigh_sqr(mp_ptr res, mp_srcptr u, mp_size_t n)
{
    mp_ptr tmp;
    mp_limb_t bot;
    tmp = flint_malloc(sizeof(mp_limb_t) * (2 * n));
    flint_mpn_sqr(tmp, u, n);
    memcpy(res, tmp + n, sizeof(mp_limb_t) * n);
    bot = tmp[n - 1];
    flint_free(tmp);
    return bot;
}

mp_limb_t
_flint_mpn_sqrhigh(mp_ptr res, mp_srcptr u, mp_size_t n)
{
    if (n <= FLINT_MPN_SQRHIGH_MULDERS_CUTOFF)
        return _flint_mpn_sqrhigh_basecase(res, u, n);
    else if (n <= FLINT_MPN_SQRHIGH_SQR_CUTOFF)
        return _flint_mpn_sqrhigh_mulders(res, u, n);
    else
        return _flint_mpn_sqrhigh_sqr(res, u, n);
}

mp_limb_pair_t _flint_mpn_sqrhigh_normalised(mp_ptr rp, mp_srcptr xp, mp_size_t n)
{
    mp_limb_pair_t ret;

    FLINT_ASSERT(n >= 1);
    FLINT_ASSERT(rp != xp);

    ret.m1 = flint_mpn_sqrhigh(rp, xp, n);

    if (rp[n - 1] >> (FLINT_BITS - 1))
    {
        ret.m2 = 0;
    }
    else
    {
        ret.m2 = 1;
        mpn_lshift(rp, rp, n, 1);
        rp[0] |= (ret.m1 >> (FLINT_BITS - 1));
        ret.m1 <<= 1;
    }

    return ret;
}
