/**
 *  \file src/MBCSR/MatMult/1x8.c
 *  \brief MBCSR 1x8 SpMV implementation, for all transpose options.
 *  \ingroup MATTYPE_MBCSR
 *
 *  Automatically generated by ./gen_symm.sh on Wed Jun  8 15:57:16 PDT 2005.
 */

#if HAVE_CONFIG_H
#include <config/config.h> /* for 'restrict' keyword */
#endif

#include <assert.h>

#include <oski/common.h>
#include <oski/mangle.h>
#include <oski/vecview.h>
#include <oski/MBCSR/format.h>
#include <oski/MBCSR/module.h>

#if IS_VAL_COMPLEX
/** Complex-valued, so do not use explicit 'register' keyword */
#define REGISTER
#else
/** Real-valued, so use explicit 'register' keyword */
#define REGISTER register
#endif



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$,
 *  specialized for unit-stride \f$x\f$ and unit-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 entries
 *  of \p y that start at the block's leftmost column index. Pass 2
 *  applies the 1x1 diagonal blocks.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MUL(z, a, b) is z = a*b and
 *  VAL_MAC(z, a, b) is z += a*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Element offset applied to \p x and \p y for row 0.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in,out] y Vector updated in place.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[1] );
			VAL_ASSIGN( xv2, x_blk[2] );
			VAL_ASSIGN( xv3, x_blk[3] );
			VAL_ASSIGN( xv4, x_blk[4] );
			VAL_ASSIGN( xv5, x_blk[5] );
			VAL_ASSIGN( xv6, x_blk[6] );
			VAL_ASSIGN( xv7, x_blk[7] );

			/* Contribution of the block to the current row */
			VAL_MAC( row_sum, bval[0], xv0 );
			VAL_MAC( row_sum, bval[1], xv1 );
			VAL_MAC( row_sum, bval[2], xv2 );
			VAL_MAC( row_sum, bval[3], xv3 );
			VAL_MAC( row_sum, bval[4], xv4 );
			VAL_MAC( row_sum, bval[5], xv5 );
			VAL_MAC( row_sum, bval[6], xv6 );
			VAL_MAC( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 entries of y at col0 */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC( tv0, bval[0], ax_row );
			VAL_MAC( tv1, bval[1], ax_row );
			VAL_MAC( tv2, bval[2], ax_row );
			VAL_MAC( tv3, bval[3], ax_row );
			VAL_MAC( tv4, bval[4], ax_row );
			VAL_MAC( tv5, bval[5], ax_row );
			VAL_MAC( tv6, bval[6], ax_row );
			VAL_MAC( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[1], tv1 );
			VAL_INC( y_blk[2], tv2 );
			VAL_INC( y_blk[3], tv3 );
			VAL_INC( y_blk[4], tv4 );
			VAL_INC( y_blk[5], tv5 );
			VAL_INC( y_blk[6], tv6 );
			VAL_INC( y_blk[7], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row++;
		x_row++;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row++;
		x_row++;
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX */
#define MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$,
 *  specialized for unit-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 strided
 *  entries of \p y that start at the block's leftmost column index.
 *  Pass 2 applies the 1x1 diagonal blocks.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MUL(z, a, b) is z = a*b and
 *  VAL_MAC(z, a, b) is z += a*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector (unit stride).
 *  \param[in,out] y Vector updated in place.
 *  \param[in] incy Distance between consecutive logical entries of \p y.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* y_row = y + d0*incy;
	const oski_value_t* x_row = x + d0;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0*incy;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[1] );
			VAL_ASSIGN( xv2, x_blk[2] );
			VAL_ASSIGN( xv3, x_blk[3] );
			VAL_ASSIGN( xv4, x_blk[4] );
			VAL_ASSIGN( xv5, x_blk[5] );
			VAL_ASSIGN( xv6, x_blk[6] );
			VAL_ASSIGN( xv7, x_blk[7] );

			/* Contribution of the block to the current row */
			VAL_MAC( row_sum, bval[0], xv0 );
			VAL_MAC( row_sum, bval[1], xv1 );
			VAL_MAC( row_sum, bval[2], xv2 );
			VAL_MAC( row_sum, bval[3], xv3 );
			VAL_MAC( row_sum, bval[4], xv4 );
			VAL_MAC( row_sum, bval[5], xv5 );
			VAL_MAC( row_sum, bval[6], xv6 );
			VAL_MAC( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 strided entries of y */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC( tv0, bval[0], ax_row );
			VAL_MAC( tv1, bval[1], ax_row );
			VAL_MAC( tv2, bval[2], ax_row );
			VAL_MAC( tv3, bval[3], ax_row );
			VAL_MAC( tv4, bval[4], ax_row );
			VAL_MAC( tv5, bval[5], ax_row );
			VAL_MAC( tv6, bval[6], ax_row );
			VAL_MAC( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[incy], tv1 );
			VAL_INC( y_blk[2*incy], tv2 );
			VAL_INC( y_blk[3*incy], tv3 );
			VAL_INC( y_blk[4*incy], tv4 );
			VAL_INC( y_blk[5*incy], tv5 );
			VAL_INC( y_blk[6*incy], tv6 );
			VAL_INC( y_blk[7*incy], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += incy;
		x_row++;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0*incy;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row += incy;
		x_row++;
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$,
 *  specialized for general-stride \f$x\f$ and unit-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 entries
 *  of \p y that start at the block's leftmost column index. Pass 2
 *  applies the 1x1 diagonal blocks.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MUL(z, a, b) is z = a*b and
 *  VAL_MAC(z, a, b) is z += a*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in] incx Distance between consecutive logical entries of \p x.
 *  \param[in,out] y Vector updated in place (unit stride).
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0*incx;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 strided entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[incx] );
			VAL_ASSIGN( xv2, x_blk[2*incx] );
			VAL_ASSIGN( xv3, x_blk[3*incx] );
			VAL_ASSIGN( xv4, x_blk[4*incx] );
			VAL_ASSIGN( xv5, x_blk[5*incx] );
			VAL_ASSIGN( xv6, x_blk[6*incx] );
			VAL_ASSIGN( xv7, x_blk[7*incx] );

			/* Contribution of the block to the current row */
			VAL_MAC( row_sum, bval[0], xv0 );
			VAL_MAC( row_sum, bval[1], xv1 );
			VAL_MAC( row_sum, bval[2], xv2 );
			VAL_MAC( row_sum, bval[3], xv3 );
			VAL_MAC( row_sum, bval[4], xv4 );
			VAL_MAC( row_sum, bval[5], xv5 );
			VAL_MAC( row_sum, bval[6], xv6 );
			VAL_MAC( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 entries of y at col0 */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC( tv0, bval[0], ax_row );
			VAL_MAC( tv1, bval[1], ax_row );
			VAL_MAC( tv2, bval[2], ax_row );
			VAL_MAC( tv3, bval[3], ax_row );
			VAL_MAC( tv4, bval[4], ax_row );
			VAL_MAC( tv5, bval[5], ax_row );
			VAL_MAC( tv6, bval[6], ax_row );
			VAL_MAC( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[1], tv1 );
			VAL_INC( y_blk[2], tv2 );
			VAL_INC( y_blk[3], tv3 );
			VAL_INC( y_blk[4], tv4 );
			VAL_INC( y_blk[5], tv5 );
			VAL_INC( y_blk[6], tv6 );
			VAL_INC( y_blk[7], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row++;
		x_row += incx;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row++;
		x_row += incx;
	}
}


#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX */
#define MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$,
 *  for general-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 strided
 *  entries of \p y that start at the block's leftmost column index.
 *  Pass 2 applies the 1x1 diagonal blocks.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MUL(z, a, b) is z = a*b and
 *  VAL_MAC(z, a, b) is z += a*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in] incx Distance between consecutive logical entries of \p x.
 *  \param[in,out] y Vector updated in place.
 *  \param[in] incy Distance between consecutive logical entries of \p y.
 */
void
MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* y_row = y + d0*incy;
	const oski_value_t* x_row = x + d0*incx;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0*incy;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 strided entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[incx] );
			VAL_ASSIGN( xv2, x_blk[2*incx] );
			VAL_ASSIGN( xv3, x_blk[3*incx] );
			VAL_ASSIGN( xv4, x_blk[4*incx] );
			VAL_ASSIGN( xv5, x_blk[5*incx] );
			VAL_ASSIGN( xv6, x_blk[6*incx] );
			VAL_ASSIGN( xv7, x_blk[7*incx] );

			/* Contribution of the block to the current row */
			VAL_MAC( row_sum, bval[0], xv0 );
			VAL_MAC( row_sum, bval[1], xv1 );
			VAL_MAC( row_sum, bval[2], xv2 );
			VAL_MAC( row_sum, bval[3], xv3 );
			VAL_MAC( row_sum, bval[4], xv4 );
			VAL_MAC( row_sum, bval[5], xv5 );
			VAL_MAC( row_sum, bval[6], xv6 );
			VAL_MAC( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 strided entries of y */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC( tv0, bval[0], ax_row );
			VAL_MAC( tv1, bval[1], ax_row );
			VAL_MAC( tv2, bval[2], ax_row );
			VAL_MAC( tv3, bval[3], ax_row );
			VAL_MAC( tv4, bval[4], ax_row );
			VAL_MAC( tv5, bval[5], ax_row );
			VAL_MAC( tv6, bval[6], ax_row );
			VAL_MAC( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[incy], tv1 );
			VAL_INC( y_blk[2*incy], tv2 );
			VAL_INC( y_blk[3*incy], tv3 );
			VAL_INC( y_blk[4*incy], tv4 );
			VAL_INC( y_blk[5*incy], tv5 );
			VAL_INC( y_blk[6*incy], tv6 );
			VAL_INC( y_blk[7*incy], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += incy;
		x_row += incx;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row += incy;
		x_row += incx;
	}
}


/**
 *  \brief Symmetric matrix times single-vector multiply in the normal
 *  case: selects the kernel variant matching the strides.
 *
 *  A stride equal to 1 selects the unit-stride ("s1") specialization
 *  for that vector; any other value selects the general-stride ("sX")
 *  one. All remaining arguments are forwarded unchanged.
 */
static void
SymmMatMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	if( incx == 1 && incy == 1 ) {
		MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	if( incx == 1 ) { /* unit incx, general incy */
		MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	if( incy == 1 ) { /* general incx, unit incy */
		MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* general incx and general incy */
	MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = A\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Applies the single-vector routine once per column of \p x_view,
 *  stepping both views by their column increments and passing their
 *  row increments as the vector strides.
 *
 *  \param[in] A Submatrix; asserted to have 1x8 blocks.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x_view Input vector view.
 *  \param[in,out] y_view Output vector view, updated in place.
 *  \returns Always 0.
 */
static int
SymmMatMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	oski_index_t col; /* index of the current column */
	const oski_value_t* x_col; /* start of X(:, col) */
	oski_value_t* y_col; /* start of Y(:, col) */

	assert( A->r == 1 );
	assert( A->c == 8 );

	col = 0;
	x_col = x_view->val;
	y_col = y_view->val;
	while( col < x_view->num_cols )
	{
		SymmMatMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		col++;
		x_col += x_view->colinc;
		y_col += y_view->colinc;
	}

	return 0;
}


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for unit-stride \f$x\f$ and unit-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 entries
 *  of \p y that start at the block's leftmost column index. Pass 2
 *  applies the 1x1 diagonal blocks. Every product involving a stored
 *  matrix value goes through VAL_MAC_CONJ.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MAC(z, a, b) is z += a*b and
 *  VAL_MAC_CONJ(z, a, b) is z += conj(a)*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Element offset applied to \p x and \p y for row 0.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in,out] y Vector updated in place.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[1] );
			VAL_ASSIGN( xv2, x_blk[2] );
			VAL_ASSIGN( xv3, x_blk[3] );
			VAL_ASSIGN( xv4, x_blk[4] );
			VAL_ASSIGN( xv5, x_blk[5] );
			VAL_ASSIGN( xv6, x_blk[6] );
			VAL_ASSIGN( xv7, x_blk[7] );

			/* Contribution of the block to the current row */
			VAL_MAC_CONJ( row_sum, bval[0], xv0 );
			VAL_MAC_CONJ( row_sum, bval[1], xv1 );
			VAL_MAC_CONJ( row_sum, bval[2], xv2 );
			VAL_MAC_CONJ( row_sum, bval[3], xv3 );
			VAL_MAC_CONJ( row_sum, bval[4], xv4 );
			VAL_MAC_CONJ( row_sum, bval[5], xv5 );
			VAL_MAC_CONJ( row_sum, bval[6], xv6 );
			VAL_MAC_CONJ( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 entries of y at col0 */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC_CONJ( tv0, bval[0], ax_row );
			VAL_MAC_CONJ( tv1, bval[1], ax_row );
			VAL_MAC_CONJ( tv2, bval[2], ax_row );
			VAL_MAC_CONJ( tv3, bval[3], ax_row );
			VAL_MAC_CONJ( tv4, bval[4], ax_row );
			VAL_MAC_CONJ( tv5, bval[5], ax_row );
			VAL_MAC_CONJ( tv6, bval[6], ax_row );
			VAL_MAC_CONJ( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[1], tv1 );
			VAL_INC( y_blk[2], tv2 );
			VAL_INC( y_blk[3], tv3 );
			VAL_INC( y_blk[4], tv4 );
			VAL_INC( y_blk[5], tv5 );
			VAL_INC( y_blk[6], tv6 );
			VAL_INC( y_blk[7], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row++;
		x_row++;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC_CONJ( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row++;
		x_row++;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for unit-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 strided
 *  entries of \p y that start at the block's leftmost column index.
 *  Pass 2 applies the 1x1 diagonal blocks. Every product involving a
 *  stored matrix value goes through VAL_MAC_CONJ.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MAC(z, a, b) is z += a*b and
 *  VAL_MAC_CONJ(z, a, b) is z += conj(a)*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector (unit stride).
 *  \param[in,out] y Vector updated in place.
 *  \param[in] incy Distance between consecutive logical entries of \p y.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* y_row = y + d0*incy;
	const oski_value_t* x_row = x + d0;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0*incy;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[1] );
			VAL_ASSIGN( xv2, x_blk[2] );
			VAL_ASSIGN( xv3, x_blk[3] );
			VAL_ASSIGN( xv4, x_blk[4] );
			VAL_ASSIGN( xv5, x_blk[5] );
			VAL_ASSIGN( xv6, x_blk[6] );
			VAL_ASSIGN( xv7, x_blk[7] );

			/* Contribution of the block to the current row */
			VAL_MAC_CONJ( row_sum, bval[0], xv0 );
			VAL_MAC_CONJ( row_sum, bval[1], xv1 );
			VAL_MAC_CONJ( row_sum, bval[2], xv2 );
			VAL_MAC_CONJ( row_sum, bval[3], xv3 );
			VAL_MAC_CONJ( row_sum, bval[4], xv4 );
			VAL_MAC_CONJ( row_sum, bval[5], xv5 );
			VAL_MAC_CONJ( row_sum, bval[6], xv6 );
			VAL_MAC_CONJ( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 strided entries of y */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC_CONJ( tv0, bval[0], ax_row );
			VAL_MAC_CONJ( tv1, bval[1], ax_row );
			VAL_MAC_CONJ( tv2, bval[2], ax_row );
			VAL_MAC_CONJ( tv3, bval[3], ax_row );
			VAL_MAC_CONJ( tv4, bval[4], ax_row );
			VAL_MAC_CONJ( tv5, bval[5], ax_row );
			VAL_MAC_CONJ( tv6, bval[6], ax_row );
			VAL_MAC_CONJ( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[incy], tv1 );
			VAL_INC( y_blk[2*incy], tv2 );
			VAL_INC( y_blk[3*incy], tv3 );
			VAL_INC( y_blk[4*incy], tv4 );
			VAL_INC( y_blk[5*incy], tv5 );
			VAL_INC( y_blk[6*incy], tv6 );
			VAL_INC( y_blk[7*incy], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += incy;
		x_row++;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0*incy;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC_CONJ( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row += incy;
		x_row++;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  specialized for general-stride \f$x\f$ and unit-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 entries
 *  of \p y that start at the block's leftmost column index. Pass 2
 *  applies the 1x1 diagonal blocks. Every product involving a stored
 *  matrix value goes through VAL_MAC_CONJ.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MAC(z, a, b) is z += a*b and
 *  VAL_MAC_CONJ(z, a, b) is z += conj(a)*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in] incx Distance between consecutive logical entries of \p x.
 *  \param[in,out] y Vector updated in place (unit stride).
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	oski_value_t* y_row = y + d0;
	const oski_value_t* x_row = x + d0*incx;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 strided entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[incx] );
			VAL_ASSIGN( xv2, x_blk[2*incx] );
			VAL_ASSIGN( xv3, x_blk[3*incx] );
			VAL_ASSIGN( xv4, x_blk[4*incx] );
			VAL_ASSIGN( xv5, x_blk[5*incx] );
			VAL_ASSIGN( xv6, x_blk[6*incx] );
			VAL_ASSIGN( xv7, x_blk[7*incx] );

			/* Contribution of the block to the current row */
			VAL_MAC_CONJ( row_sum, bval[0], xv0 );
			VAL_MAC_CONJ( row_sum, bval[1], xv1 );
			VAL_MAC_CONJ( row_sum, bval[2], xv2 );
			VAL_MAC_CONJ( row_sum, bval[3], xv3 );
			VAL_MAC_CONJ( row_sum, bval[4], xv4 );
			VAL_MAC_CONJ( row_sum, bval[5], xv5 );
			VAL_MAC_CONJ( row_sum, bval[6], xv6 );
			VAL_MAC_CONJ( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 entries of y at col0 */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC_CONJ( tv0, bval[0], ax_row );
			VAL_MAC_CONJ( tv1, bval[1], ax_row );
			VAL_MAC_CONJ( tv2, bval[2], ax_row );
			VAL_MAC_CONJ( tv3, bval[3], ax_row );
			VAL_MAC_CONJ( tv4, bval[4], ax_row );
			VAL_MAC_CONJ( tv5, bval[5], ax_row );
			VAL_MAC_CONJ( tv6, bval[6], ax_row );
			VAL_MAC_CONJ( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[1], tv1 );
			VAL_INC( y_blk[2], tv2 );
			VAL_INC( y_blk[3], tv3 );
			VAL_INC( y_blk[4], tv4 );
			VAL_INC( y_blk[5], tv5 );
			VAL_INC( y_blk[6], tv6 );
			VAL_INC( y_blk[7], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row++;
		x_row += incx;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC_CONJ( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row++;
		x_row += incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX */
#define MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 symmetric kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$,
 *  for general-stride \f$x\f$ and general-stride \f$y\f$.
 *
 *  Pass 1 walks the stored off-diagonal 1x8 blocks. Each block feeds
 *  the accumulator of the current row and is also scattered, combined
 *  with alpha times the current row's \p x entry, into the 8 strided
 *  entries of \p y that start at the block's leftmost column index.
 *  Pass 2 applies the 1x1 diagonal blocks. Every product involving a
 *  stored matrix value goes through VAL_MAC_CONJ.
 *
 *  \note The VAL_* arithmetic macros are defined outside this file;
 *  this description assumes VAL_MAC(z, a, b) is z += a*b and
 *  VAL_MAC_CONJ(z, a, b) is z += conj(a)*b -- TODO confirm.
 *
 *  \param[in] M Number of block rows visited.
 *  \param[in] d0 Logical element offset of row 0 in \p x and \p y.
 *  \param[in] bptr Row I has bptr[I+1] - bptr[I] off-diagonal blocks.
 *  \param[in] bind Leftmost column index per block, read sequentially.
 *  \param[in] bval Block values, 8 per block, read sequentially.
 *  \param[in] bdiag Diagonal values, 1 per block row.
 *  \param[in] alpha Scalar applied to the product.
 *  \param[in] x Input vector.
 *  \param[in] incx Distance between consecutive logical entries of \p x.
 *  \param[in,out] y Vector updated in place.
 *  \param[in] incy Distance between consecutive logical entries of \p y.
 */
void
MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	oski_value_t* y_row = y + d0*incy;
	const oski_value_t* x_row = x + d0*incx;
	oski_index_t row;

	/* Pass 1: off-diagonal blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum;
		REGISTER oski_value_t ax_row;
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			oski_index_t col0 = bind[0]; /* leftmost column of this block */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0*incy;

			REGISTER oski_value_t xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
			REGISTER oski_value_t tv0, tv1, tv2, tv3, tv4, tv5, tv6, tv7;

			/* Fetch the 8 strided entries of x spanned by the block */
			VAL_ASSIGN( xv0, x_blk[0] );
			VAL_ASSIGN( xv1, x_blk[incx] );
			VAL_ASSIGN( xv2, x_blk[2*incx] );
			VAL_ASSIGN( xv3, x_blk[3*incx] );
			VAL_ASSIGN( xv4, x_blk[4*incx] );
			VAL_ASSIGN( xv5, x_blk[5*incx] );
			VAL_ASSIGN( xv6, x_blk[6*incx] );
			VAL_ASSIGN( xv7, x_blk[7*incx] );

			/* Contribution of the block to the current row */
			VAL_MAC_CONJ( row_sum, bval[0], xv0 );
			VAL_MAC_CONJ( row_sum, bval[1], xv1 );
			VAL_MAC_CONJ( row_sum, bval[2], xv2 );
			VAL_MAC_CONJ( row_sum, bval[3], xv3 );
			VAL_MAC_CONJ( row_sum, bval[4], xv4 );
			VAL_MAC_CONJ( row_sum, bval[5], xv5 );
			VAL_MAC_CONJ( row_sum, bval[6], xv6 );
			VAL_MAC_CONJ( row_sum, bval[7], xv7 );

			/* Mirrored contribution to the 8 strided entries of y */
			VAL_SET_ZERO( tv0 );
			VAL_SET_ZERO( tv1 );
			VAL_SET_ZERO( tv2 );
			VAL_SET_ZERO( tv3 );
			VAL_SET_ZERO( tv4 );
			VAL_SET_ZERO( tv5 );
			VAL_SET_ZERO( tv6 );
			VAL_SET_ZERO( tv7 );
			VAL_MAC_CONJ( tv0, bval[0], ax_row );
			VAL_MAC_CONJ( tv1, bval[1], ax_row );
			VAL_MAC_CONJ( tv2, bval[2], ax_row );
			VAL_MAC_CONJ( tv3, bval[3], ax_row );
			VAL_MAC_CONJ( tv4, bval[4], ax_row );
			VAL_MAC_CONJ( tv5, bval[5], ax_row );
			VAL_MAC_CONJ( tv6, bval[6], ax_row );
			VAL_MAC_CONJ( tv7, bval[7], ax_row );
			VAL_INC( y_blk[0], tv0 );
			VAL_INC( y_blk[incy], tv1 );
			VAL_INC( y_blk[2*incy], tv2 );
			VAL_INC( y_blk[3*incy], tv3 );
			VAL_INC( y_blk[4*incy], tv4 );
			VAL_INC( y_blk[5*incy], tv5 );
			VAL_INC( y_blk[6*incy], tv6 );
			VAL_INC( y_blk[7*incy], tv7 );

			/* Step to the next stored block */
			bind++;
			bval += 8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += incy;
		x_row += incx;
	}

	/* Pass 2: 1x1 diagonal blocks */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t diag_prod;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( diag_prod );
		VAL_MAC_CONJ( diag_prod, bdiag[0], x_diag );
		VAL_MAC( y_row[0], alpha, diag_prod );

		bdiag++;
		y_row += incy;
		x_row += incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the conj case;
 *  see SymmMatMult_v1().
 */
#define SymmMatConjMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Symmetric matrix times single-vector multiply in the conj
 *  case: selects the kernel variant matching the strides.
 *
 *  A stride equal to 1 selects the unit-stride ("s1") specialization
 *  for that vector; any other value selects the general-stride ("sX")
 *  one. All remaining arguments are forwarded unchanged.
 */
static void
SymmMatConjMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	if( incx == 1 && incy == 1 ) {
		MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
		return;
	}

	if( incx == 1 ) { /* unit incx, general incy */
		MBCSR_SymmMatConjMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
		return;
	}

	if( incy == 1 ) { /* general incx, unit incy */
		MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
		return;
	}

	/* general incx and general incy */
	MBCSR_SymmMatConjMult_v1_aX_b1_xsX_ysX( M, d0,
		bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define SymmMatConjMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = \bar{A}\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Applies the single-vector conj routine once per column of
 *  \p x_view, stepping both views by their column increments and
 *  passing their row increments as the vector strides.
 *
 *  \param[in] A Submatrix; asserted to have 1x8 blocks.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x_view Input vector view.
 *  \param[in,out] y_view Output vector view, updated in place.
 *  \returns Always 0.
 */
static int
SymmMatConjMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	oski_index_t col; /* index of the current column */
	const oski_value_t* x_col; /* start of X(:, col) */
	oski_value_t* y_col; /* start of Y(:, col) */

	assert( A->r == 1 );
	assert( A->c == 8 );

	col = 0;
	x_col = x_view->val;
	y_col = y_view->val;
	while( col < x_view->num_cols )
	{
		SymmMatConjMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		col++;
		x_col += x_view->colinc;
		y_col += y_view->colinc;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xs1_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  specialized for unit-stride x and unit-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC, and the eight y entries
 *  starting at the block's leftmost column via VAL_MAC_CONJ. The
 *  1x1 diagonal entries are applied in a second sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector (unit stride).
 *  \param[in,out] y Destination vector (unit stride), updated in place.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	const oski_value_t* x_row = x + d0; /* x entry of the current row */
	oski_value_t* y_row = y + d0; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC_CONJ( y_part[k], bval[k], ax_row );
				VAL_MAC( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1;
		x_row += 1;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1;
		x_row += 1;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xs1_ysX */
#define MBCSR_HermMatMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xs1_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  specialized for unit-stride x and general-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC, and the eight y entries
 *  starting at the block's leftmost column via VAL_MAC_CONJ. The
 *  1x1 diagonal entries are applied in a second sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector (unit stride).
 *  \param[in,out] y Destination vector, updated in place.
 *  \param[in] incy Distance between consecutive elements of y.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	const oski_value_t* x_row = x + d0; /* x entry of the current row */
	oski_value_t* y_row = y + d0*incy; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0*incy;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC_CONJ( y_part[k], bval[k], ax_row );
				VAL_MAC( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k*incy], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1*incy;
		x_row += 1;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0*incy;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1*incy;
		x_row += 1;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xsX_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  specialized for general-stride x and unit-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC, and the eight y entries
 *  starting at the block's leftmost column via VAL_MAC_CONJ. The
 *  1x1 diagonal entries are applied in a second sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector (unit stride), updated in place.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	const oski_value_t* x_row = x + d0*incx; /* x entry of the current row */
	oski_value_t* y_row = y + d0; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k*incx] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC_CONJ( y_part[k], bval[k], ax_row );
				VAL_MAC( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1;
		x_row += 1*incx;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1;
		x_row += 1*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatMult_v1_aX_b1_xsX_ysX */
#define MBCSR_HermMatMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_HermMatMult_v1_aX_b1_xsX_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot A\cdot x\f$ (Hermitian variant),
 *  for general-stride x and general-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC, and the eight y entries
 *  starting at the block's leftmost column via VAL_MAC_CONJ. The
 *  1x1 diagonal entries are applied in a second sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector, updated in place.
 *  \param[in] incy Distance between consecutive elements of y.
 */
void
MBCSR_HermMatMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	const oski_value_t* x_row = x + d0*incx; /* x entry of the current row */
	oski_value_t* y_row = y + d0*incy; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0*incy;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k*incx] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC_CONJ( y_part[k], bval[k], ax_row );
				VAL_MAC( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k*incy], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1*incy;
		x_row += 1*incx;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1*incy;
		x_row += 1*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the normal case;
 *  see SymmMatMult_v1().
 */
#define HermMatMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Hermitian matrix times single-vector multiply in the normal case.
 *
 *  Forwards to the kernel specialized for the given strides: a
 *  unit-stride ("s1") variant is chosen for each of x and y whose
 *  increment equals 1, and the general-stride ("sX") variant otherwise.
 *
 *  \param[in] M, d0, bptr, bind, bval, bdiag Passed through unchanged.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector.
 *  \param[in] incy Distance between consecutive elements of y.
 */
static void
HermMatMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int unit_x = ( incx == 1 );
	const int unit_y = ( incy == 1 );

	if( unit_x && unit_y ) {
		MBCSR_HermMatMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
	} else if( unit_x ) { /* general incy */
		MBCSR_HermMatMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
	} else if( unit_y ) { /* general incx */
		MBCSR_HermMatMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
	} else { /* general incx and incy */
		MBCSR_HermMatMult_v1_aX_b1_xsX_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define HermMatMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = A\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Walks the columns of x_view and y_view in lock-step and applies
 *  HermMatMult_v1() to each pair of columns. The number of columns
 *  processed is taken from x_view.
 *
 *  \param[in] A Submatrix in 1x8 MBCSR format (asserted: r == 1, c == 8).
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x_view Source vector view.
 *  \param[in,out] y_view Destination vector view, updated in place.
 *  \returns 0 always.
 */
static int
HermMatMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	const oski_value_t* x_col = x_view->val; /* start of current column of X */
	oski_value_t* y_col = y_view->val; /* start of current column of Y */
	oski_index_t col = 0; /* current column number */

	assert( A->r == 1 );
	assert( A->c == 8 );

	while( col < x_view->num_cols )
	{
		HermMatMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		/* Advance both views to their next column */
		x_col += x_view->colinc;
		y_col += y_view->colinc;
		col++;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 MBCSR_SymmMatMult_v1_aX_b1_xs1_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1 MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ (Hermitian variant),
 *  specialized for unit-stride x and unit-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC_CONJ, and the eight y
 *  entries starting at the block's leftmost column via VAL_MAC. The
 *  1x1 diagonal entries are applied (via VAL_MAC_CONJ) in a second
 *  sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector (unit stride).
 *  \param[in,out] y Destination vector (unit stride), updated in place.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y )
{
	const oski_value_t* x_row = x + d0; /* x entry of the current row */
	oski_value_t* y_row = y + d0; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC( y_part[k], bval[k], ax_row );
				VAL_MAC_CONJ( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1;
		x_row += 1;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC_CONJ( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1;
		x_row += 1;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX MBCSR_SymmMatMult_v1_aX_b1_xs1_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX */
#define MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ (Hermitian variant),
 *  specialized for unit-stride x and general-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC_CONJ, and the eight y
 *  entries starting at the block's leftmost column via VAL_MAC. The
 *  1x1 diagonal entries are applied (via VAL_MAC_CONJ) in a second
 *  sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector (unit stride).
 *  \param[in,out] y Destination vector, updated in place.
 *  \param[in] incy Distance between consecutive elements of y.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x,
	oski_value_t* restrict y, oski_index_t incy )
{
	const oski_value_t* x_row = x + d0; /* x entry of the current row */
	oski_value_t* y_row = y + d0*incy; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0;
			oski_value_t* y_blk = y + col0*incy;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC( y_part[k], bval[k], ax_row );
				VAL_MAC_CONJ( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k*incy], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1*incy;
		x_row += 1;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0*incy;
	x_row = x + d0;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC_CONJ( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1*incy;
		x_row += 1;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 MBCSR_SymmMatMult_v1_aX_b1_xsX_ys1
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1 MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ (Hermitian variant),
 *  specialized for general-stride x and unit-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC_CONJ, and the eight y
 *  entries starting at the block's leftmost column via VAL_MAC. The
 *  1x1 diagonal entries are applied (via VAL_MAC_CONJ) in a second
 *  sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector (unit stride), updated in place.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y )
{
	const oski_value_t* x_row = x + d0*incx; /* x entry of the current row */
	oski_value_t* y_row = y + d0; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k*incx] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC( y_part[k], bval[k], ax_row );
				VAL_MAC_CONJ( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1;
		x_row += 1*incx;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC_CONJ( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1;
		x_row += 1*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX

#if defined(DO_NAME_MANGLING)
/** See MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX(). */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX MBCSR_SymmMatMult_v1_aX_b1_xsX_ysX
#endif

#else /* IS_VAL_COMPLEX */



#if defined(DO_NAME_MANGLING)
/** Mangled name for MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX */
#define MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX MANGLE_MOD_(MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX_1x8)
#endif

/**
 *  \brief MBCSR 1x8 kernel for
 *  \f$y \leftarrow y + \alpha\cdot \bar{A}\cdot x\f$ (Hermitian variant),
 *  for general-stride x and general-stride y.
 *
 *  Every stored 1x8 block is used twice: its entries update the y
 *  entry of the block's own row via VAL_MAC_CONJ, and the eight y
 *  entries starting at the block's leftmost column via VAL_MAC. The
 *  1x1 diagonal entries are applied (via VAL_MAC_CONJ) in a second
 *  sweep.
 *
 *  \param[in] M Number of rows to process.
 *  \param[in] d0 Index of the first row; x and y are addressed
 *  starting at this index.
 *  \param[in] bptr Block pointers: row I owns blocks bptr[I] up to
 *  (but excluding) bptr[I+1].
 *  \param[in] bind Leftmost column index of each block, read
 *  sequentially from the start of the array.
 *  \param[in] bval Block values, 8 per block, read sequentially
 *  from the start of the array.
 *  \param[in] bdiag Diagonal values, one per row.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector, updated in place.
 *  \param[in] incy Distance between consecutive elements of y.
 */
void
MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX( oski_index_t M, oski_index_t d0,
	const oski_index_t* restrict bptr, const oski_index_t* restrict bind,
	const oski_value_t* restrict bval, const oski_value_t* restrict bdiag,
	oski_value_t alpha, const oski_value_t* restrict x, oski_index_t incx,
	oski_value_t* restrict y, oski_index_t incy )
{
	const oski_value_t* x_row = x + d0*incx; /* x entry of the current row */
	oski_value_t* y_row = y + d0*incy; /* y entry of the current row */
	oski_index_t row;

	/* Sweep 1: off-diagonal 1x8 blocks */
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t row_sum; /* unscaled sum of (block * x) for this row */
		REGISTER oski_value_t ax_row; /* alpha * x(row) */
		oski_index_t blk;

		VAL_SET_ZERO( row_sum );
		VAL_MUL( ax_row, alpha, x_row[0] );

		for( blk = bptr[row]; blk < bptr[row+1]; blk++ )
		{
			const oski_index_t col0 = bind[0]; /* block's leftmost col index */
			const oski_value_t* x_blk = x + col0*incx;
			oski_value_t* y_blk = y + col0*incy;
			oski_value_t y_part[8]; /* pending updates for the 8 y entries */
			oski_index_t k;

			/* All reads of x for this block happen before any write to y */
			for( k = 0; k < 8; k++ )
			{
				REGISTER oski_value_t x_k;

				VAL_ASSIGN( x_k, x_blk[k*incx] );
				VAL_SET_ZERO( y_part[k] );
				VAL_MAC( y_part[k], bval[k], ax_row );
				VAL_MAC_CONJ( row_sum, bval[k], x_k );
			}
			for( k = 0; k < 8; k++ )
			{
				VAL_INC( y_blk[k*incy], y_part[k] );
			}

			/* Move on to the next stored block */
			bind++;
			bval += 1*8;
		}
		VAL_MAC( y_row[0], alpha, row_sum );

		y_row += 1*incy;
		x_row += 1*incx;
	}

	/* Sweep 2: 1x1 diagonal entries */
	y_row = y + d0*incy;
	x_row = x + d0*incx;
	for( row = 0; row < M; row++ )
	{
		REGISTER oski_value_t x_diag;
		REGISTER oski_value_t y_diag;

		VAL_ASSIGN( x_diag, x_row[0] );
		VAL_SET_ZERO( y_diag );
		VAL_MAC_CONJ( y_diag, bdiag[row], x_diag );
		VAL_MAC( y_row[0], alpha, y_diag );

		y_row += 1*incy;
		x_row += 1*incx;
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief Matrix times single-vector multiply in the conj case;
 *  see SymmMatMult_v1().
 */
#define HermMatConjMult_v1 SymmMatMult_v1

#else /* IS_VAL_COMPLEX */


/**
 *  \brief Hermitian matrix times single-vector multiply in the conj case.
 *
 *  Forwards to the kernel specialized for the given strides: a
 *  unit-stride ("s1") variant is chosen for each of x and y whose
 *  increment equals 1, and the general-stride ("sX") variant otherwise.
 *
 *  \param[in] M, d0, bptr, bind, bval, bdiag Passed through unchanged.
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x Source vector.
 *  \param[in] incx Distance between consecutive elements of x.
 *  \param[in,out] y Destination vector.
 *  \param[in] incy Distance between consecutive elements of y.
 */
static void
HermMatConjMult_v1( oski_index_t M, oski_index_t d0,
	const oski_index_t* bptr, const oski_index_t* bind,
	const oski_value_t* bval, const oski_value_t* bdiag,
	oski_value_t alpha, const oski_value_t* x, oski_index_t incx,
	oski_value_t* y, oski_index_t incy )
{
	const int unit_x = ( incx == 1 );
	const int unit_y = ( incy == 1 );

	if( unit_x && unit_y ) {
		MBCSR_HermMatConjMult_v1_aX_b1_xs1_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y );
	} else if( unit_x ) { /* general incy */
		MBCSR_HermMatConjMult_v1_aX_b1_xs1_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, y, incy );
	} else if( unit_y ) { /* general incx */
		MBCSR_HermMatConjMult_v1_aX_b1_xsX_ys1( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y );
	} else { /* general incx and incy */
		MBCSR_HermMatConjMult_v1_aX_b1_xsX_ysX( M, d0,
			bptr, bind, bval, bdiag, alpha, x, incx, y, incy );
	}
}


#endif /* !IS_VAL_COMPLEX */


#if !IS_VAL_COMPLEX
/**
 *  \brief See SymmMatMult().
 */
#define HermMatConjMult SymmMatMult

#else /* IS_VAL_COMPLEX */



/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$\mathrm{op}(A) = \bar{A}\f$, on the fully blocked
 *  portion of \f$A\f$.
 *
 *  Walks the columns of x_view and y_view in lock-step and applies
 *  HermMatConjMult_v1() to each pair of columns. The number of
 *  columns processed is taken from x_view.
 *
 *  \param[in] A Submatrix in 1x8 MBCSR format (asserted: r == 1, c == 8).
 *  \param[in] alpha Scalar multiplier.
 *  \param[in] x_view Source vector view.
 *  \param[in,out] y_view Destination vector view, updated in place.
 *  \returns 0 always.
 */
static int
HermMatConjMult( const oski_submatMBCSR_t* A,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	const oski_value_t* x_col = x_view->val; /* start of current column of X */
	oski_value_t* y_col = y_view->val; /* start of current column of Y */
	oski_index_t col = 0; /* current column number */

	assert( A->r == 1 );
	assert( A->c == 8 );

	while( col < x_view->num_cols )
	{
		HermMatConjMult_v1( A->num_block_rows, A->offset,
			A->bptr, A->bind, A->bval, A->bdiag,
			alpha, x_col, x_view->rowinc, y_col, y_view->rowinc );

		/* Advance both views to their next column */
		x_col += x_view->colinc;
		y_col += y_view->colinc;
		col++;
	}

	return 0;
}


#endif /* !IS_VAL_COMPLEX */


#if defined(DO_NAME_MANGLING)
/** Mangled name for primary exported symbol */
#define SymmSubmatReprMult MANGLE_MOD_(SymmSubmatReprMult_1x8)
#endif

/**
 *  \brief Computes
 *  \f$y \leftarrow y + \alpha\cdot\mathrm{op}(A)\cdot x\f$,
 *  where \f$A\f$ is stored in 1x8 MBCSR format and
 *  \f$\mathrm{op}(A)\f$ is one of \f$A\f$, \f$\bar{A}\f$,
 *  \f$A^T\f$, or \f$\bar{A}^T\f$, as selected by \c opA.
 *
 *  Set is_herm to a non-zero value if \f$A\f$ is Hermitian, or
 *  0 if it is symmetric.
 */
int
SymmSubmatReprMult( const oski_submatMBCSR_t* A, int is_herm,
	oski_matop_t opA,
	oski_value_t alpha, const oski_vecview_t x_view,
	oski_vecview_t y_view )
{
	/* Non-zero when the requested op is served by the "Conj" routine,
	 * 0 when it is served by the plain routine. */
	int use_conj;

	/*
	 * OP_NORMAL and OP_CONJ map the same way for both matrix kinds.
	 * For the two transposed ops the choice flips with is_herm:
	 * OP_TRANS is served by the plain routine in the symmetric case
	 * and by the "Conj" routine in the Hermitian case, and
	 * OP_CONJ_TRANS the other way round.
	 */
	switch( opA )
	{
		case OP_NORMAL:
			use_conj = 0;
			break;
		case OP_CONJ:
			use_conj = 1;
			break;
		case OP_TRANS:
			use_conj = is_herm ? 1 : 0;
			break;
		case OP_CONJ_TRANS:
			use_conj = is_herm ? 0 : 1;
			break;
		default:
			/* NOTE(review): the name passed here is SubmatReprMult,
			 * not SymmSubmatReprMult -- confirm this is intended. */
			OSKI_ERR_BAD_MATOP( SubmatReprMult, 3, opA );
			return ERR_BAD_ARG;
	}

	if( is_herm ) {
		return use_conj
			? HermMatConjMult( A, alpha, x_view, y_view )
			: HermMatMult( A, alpha, x_view, y_view );
	}

	/* is symmetric, but not Hermitian (!is_herm) */
	return use_conj
		? SymmMatConjMult( A, alpha, x_view, y_view )
		: SymmMatMult( A, alpha, x_view, y_view );
}


/* eof */
