/* SPDX-License-Identifier: GPL-2.0-only OR GPL-3.0-only */
/* Copyright (c) 2022-2025 Brett Sheffield <bacs@librecast.net> */

#include <matrix.h>
#include <gf256.h>

/*
 * Method adapted from the technique described in:
 * J. S. Plank and K. M. Greenan and E. L. Miller (2013)
 * "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions"
 * http://web.eecs.utk.edu/~jplank/plank/papers/FAST-2013-GF.html
 *
 * mul_128 - translate 16 bytes through a pair of 16-entry lookup tables.
 *
 * For every byte b of A the result byte is
 *     (*t1)[b & 0x0f] ^ (*t2)[b >> 4]
 * i.e. the low nibble indexes t1, the high nibble indexes t2, and the two
 * table entries are XORed together. The callers in this file pass the two
 * halves of GF256LR[y], so the result is presumably the GF(256) product of
 * each byte with y.
 *
 * t1: table indexed by the low nibble of each byte
 * t2: table indexed by the high nibble of each byte
 * A:  the 16 input bytes
 *
 * Returns the 16 translated bytes.
 */
static __m128i mul_128(const __m128i *t1, const __m128i *t2, __m128i A)
{
	const __m128i nibble = _mm_set1_epi8(0x0f);

	/* low nibble of each byte selects an entry of t1 */
	const __m128i lo = _mm_shuffle_epi8(*t1, _mm_and_si128(A, nibble));

	/*
	 * Move the high nibbles down. The shift works on 64-bit lanes, so bits
	 * from the neighbouring byte spill into the top of each byte; masking
	 * afterwards discards them and leaves a clean 0..15 index for t2.
	 */
	const __m128i idx = _mm_and_si128(_mm_srli_epi64(A, 4), nibble);
	const __m128i hi = _mm_shuffle_epi8(*t2, idx);

	return _mm_xor_si128(hi, lo);
}

static void mul_128_inplace(const __m128i *t1, const __m128i *t2, uint8_t *d)
{
	__m128i D = _mm_loadu_si128((const __m128i_u *)d);
	D = mul_128(t1, t2, D);
	_mm_storeu_si128((__m128i*)d, D);
}

/*
 * matrix_row_mul_ssse3 - multiply part of a matrix row by y, in place.
 *
 * Every element of row `row` from column `off` up to m->cols is replaced by
 * its product with y. Whole 16-byte groups go through the SSSE3 table
 * lookup; the remaining (fewer than 16) bytes use GF256MUL() one at a time.
 *
 * m:   matrix to modify
 * row: index of the row to scale
 * off: first column to process
 * y:   multiplier; selects the lookup tables GF256LR[y][0] and GF256LR[y][1]
 */
void matrix_row_mul_ssse3(matrix_t *m, const int row, const int off, const uint8_t y)
{
	const __m128i lo_tab = _mm_loadu_si128((const __m128i_u *)GF256LR[y][0]);
	const __m128i hi_tab = _mm_loadu_si128((const __m128i_u *)GF256LR[y][1]);
	uint8_t *d = matrix_ptr_row(m, row) + off;
	const int len = m->cols - off;
	/* largest multiple of 16 that fits in len */
	const int vec_end = len - len % 16;
	int pos = 0;

	/* vectorised body: 16 bytes per step */
	while (pos < vec_end) {
		mul_128_inplace(&lo_tab, &hi_tab, d + pos);
		pos += 16;
	}

	/* scalar tail */
	while (pos < len) {
		d[pos] = GF256MUL(d[pos], y);
		pos++;
	}
}

/*
 * matrix_row_mul_byrow_ssse3 - add y times one row to another row.
 *
 * For every column c from `off` up to m->cols:
 *     row rdst[c] ^= y * row rsrc[c]
 * Whole 16-byte groups go through the SSSE3 table lookup; the remaining
 * (fewer than 16) bytes use GF256MUL() one at a time.
 *
 * m:    matrix to modify
 * rdst: index of the destination row (updated in place)
 * off:  first column to process
 * rsrc: index of the source row
 * y:    multiplier; must be non-zero (asserted). Selects the lookup tables
 *       GF256LR[y][0] and GF256LR[y][1]
 */
void matrix_row_mul_byrow_ssse3(matrix_t *m, const int rdst, const int off, const int rsrc, const uint8_t y)
{
	assert(y);
	const __m128i lo_tab = _mm_loadu_si128((const __m128i_u *)GF256LR[y][0]);
	const __m128i hi_tab = _mm_loadu_si128((const __m128i_u *)GF256LR[y][1]);
	uint8_t *d = matrix_ptr_row(m, rdst) + off;
	uint8_t *s = matrix_ptr_row(m, rsrc) + off;
	const int len = m->cols - off;
	/* largest multiple of 16 that fits in len */
	const int vec_end = len - len % 16;
	int pos = 0;

	/* vectorised body: 16 bytes per step */
	while (pos < vec_end) {
		const __m128i src = _mm_loadu_si128((const __m128i_u *)(s + pos));
		const __m128i dst = _mm_loadu_si128((const __m128i_u *)(d + pos));
		const __m128i prod = mul_128(&lo_tab, &hi_tab, src);

		_mm_storeu_si128((__m128i*)(d + pos), _mm_xor_si128(dst, prod));
		pos += 16;
	}

	/* scalar tail */
	while (pos < len) {
		d[pos] ^= GF256MUL(s[pos], y);
		pos++;
	}
}
