#line 1 "numpy/core/src/multiarray/einsum_sumprod.c.src"

/*
 *****************************************************************************
 **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
 **       Changes should be made to the original source (.src) file         **
 *****************************************************************************
 */

#line 1
/*
 * This file provides optimized sum of product implementations used internally
 * by einsum.
 *
 * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
 * The University of British Columbia
 *
 * See LICENSE.txt for the license.
 */

#define NPY_NO_DEPRECATED_API NPY_API_VERSION
#define _MULTIARRAYMODULE

#include <numpy/npy_common.h>
#include <numpy/ndarraytypes.h>  /* for NPY_NTYPES */
#include <numpy/halffloat.h>

#include "einsum_sumprod.h"
#include "einsum_debug.h"
#include "simd/simd.h"
#include "common.h"

/*
 * EINSUM_IS_ALIGNED(x): whether the aligned SIMD load/store variants may be
 * used for pointer x.
 *
 * ARM/NEON has no separate instructions for aligned memory access, so on
 * NEON builds the macro is hard-wired to 0.  Elsewhere it checks x against
 * NPY_SIMD_WIDTH via npy_is_aligned().
 */
#ifdef NPY_HAVE_NEON
    #define EINSUM_IS_ALIGNED(x) 0
#else
    #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
#endif

/**********************************************/

#line 74

#if !0
/*
 * Return the sum of `count` contiguous npy_byte values starting at `data`.
 *
 * The running total is accumulated in an npy_byte.  A non-positive `count`
 * performs no reads and yields 0.
 *
 * Scalar implementation only.
 */
static NPY_GCC_OPT_3 npy_byte byte_sum_of_arr(npy_byte *data, npy_intp count)
{
    npy_byte accum = 0;

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration.  The guard is `>=` so that a remainder of
     * exactly four still goes through the unrolled body, in line with the
     * other unrolled-by-four loops in this file.
     */
    for (; count >= 4; count -= 4, data += 4) {
        const npy_byte a01 = data[0] + data[1];
        const npy_byte a23 = data[2] + data[3];
        accum += a01 + a23;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Leftover elements (or all of them when unrolling is disabled). */
    for (; count > 0; --count, ++data) {
        accum += *data;
    }
    return accum;
}
#endif

#line 131
/*
 * Single-operand kernel for arbitrary strides.  For each of `count` steps
 * the current input element is added into the current output element:
 *
 *     *out = *in + *out
 *
 * dataptr[0] / strides[0] describe the input and dataptr[1] / strides[1]
 * the output.  Strides are added to char pointers, i.e. they are byte
 * offsets.  `nop` is not read in this specialization, and the entries of
 * `dataptr` themselves are left untouched (only local copies advance).
 */
static void
byte_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    const npy_intp src_step = strides[0];
    const npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_byte *out = (npy_byte *)dst;

        *out = (*(npy_byte *)src) + (*out);
        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

/*
 * Single-operand kernel for contiguous input and output:
 *
 *     data_out[i] = data0[i] + data_out[i]    for 0 <= i < count
 *
 * dataptr[0] is the input and dataptr[1] the output, both treated as
 * npy_byte arrays.  `strides` is ignored and `nop` is not read.
 *
 * Whole groups of eight are processed first in ascending index order; the
 * remaining 0..7 elements are then handled by a fall-through switch, from
 * the highest remaining index down to index 0.
 *
 * A non-positive `count` is a no-op: the switch has a `default`, so an
 * unexpected negative value can no longer cause the function to loop
 * forever.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data_out = (npy_byte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Bulk part, unrolled by 8 */
    while (count >= 8) {
        data_out[0] = data0[0] + data_out[0];
        data_out[1] = data0[1] + data_out[1];
        data_out[2] = data0[2] + data_out[2];
        data_out[3] = data0[3] + data_out[3];
        data_out[4] = data0[4] + data_out[4];
        data_out[5] = data0[5] + data_out[5];
        data_out[6] = data0[6] + data_out[6];
        data_out[7] = data0[7] + data_out[7];

        data0 += 8;
        data_out += 8;
        count -= 8;
    }

    /* Tail: here count < 8; every case deliberately falls through */
    switch (count) {
        case 7:
            data_out[6] = data0[6] + data_out[6];
            /* fallthrough */
        case 6:
            data_out[5] = data0[5] + data_out[5];
            /* fallthrough */
        case 5:
            data_out[4] = data0[4] + data_out[4];
            /* fallthrough */
        case 4:
            data_out[3] = data0[3] + data_out[3];
            /* fallthrough */
        case 3:
            data_out[2] = data0[2] + data_out[2];
            /* fallthrough */
        case 2:
            data_out[1] = data0[1] + data_out[1];
            /* fallthrough */
        case 1:
            data_out[0] = data0[0] + data_out[0];
            /* fallthrough */
        default:
            /* count <= 0: nothing left to do */
            break;
    }
}

#elif 1 == 2 && !0

/*
 * Scaled accumulate over two contiguous npy_byte buffers:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for 0 <= i < count
 *
 * Scalar implementation.  Unless NPY_DISABLE_OPTIMIZATION is defined,
 * elements are handled four at a time; within each group all inputs are
 * read before any result is written back.  A non-positive `count` is a
 * no-op.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* read */
        const npy_byte in0 = data[0], acc0 = data_out[0];
        const npy_byte in1 = data[1], acc1 = data_out[1];
        const npy_byte in2 = data[2], acc2 = data_out[2];
        const npy_byte in3 = data[3], acc3 = data_out[3];

        /* compute */
        const npy_byte res0 = scalar * in0 + acc0;
        const npy_byte res1 = scalar * in1 + acc1;
        const npy_byte res2 = scalar * in2 + acc2;
        const npy_byte res3 = scalar * in3 + acc3;

        /* write back */
        data_out[0] = res0;
        data_out[1] = res1;
        data_out[2] = res2;
        data_out[3] = res3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Leftover elements, one at a time */
    while (count > 0) {
        const npy_byte in = *data;
        const npy_byte acc = *data_out;

        *data_out = scalar * in + acc;

        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand kernel for contiguous inputs and output:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] are the inputs and dataptr[2] the output, all
 * treated as npy_byte arrays.  `strides` is ignored and `nop` is not read.
 *
 * Scalar implementation.  Unless NPY_DISABLE_OPTIMIZATION is defined,
 * elements are handled four at a time; within each group all inputs are
 * read before any result is written back.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* read */
        const npy_byte x0 = lhs[0], y0 = rhs[0], z0 = out[0];
        const npy_byte x1 = lhs[1], y1 = rhs[1], z1 = out[1];
        const npy_byte x2 = lhs[2], y2 = rhs[2], z2 = out[2];
        const npy_byte x3 = lhs[3], y3 = rhs[3], z3 = out[3];

        /* compute */
        const npy_byte r0 = x0 * y0 + z0;
        const npy_byte r1 = x1 * y1 + z1;
        const npy_byte r2 = x2 * y2 + z2;
        const npy_byte r3 = x3 * y3 + z3;

        /* write back */
        out[0] = r0;
        out[1] = r1;
        out[2] = r2;
        out[3] = r3;

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Leftover elements, one at a time */
    while (count > 0) {
        const npy_byte x = *lhs;
        const npy_byte y = *rhs;
        const npy_byte z = *out;

        *out = x * y + z;

        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 is read once as a single npy_byte
 * value, while operand 1 (dataptr[1]) and the output (dataptr[2]) are
 * passed as npy_byte arrays to byte_sum_of_products_muladd():
 *
 *     out[i] = op0 * op1[i] + out[i]    for 0 <= i < count
 *
 * `strides` is ignored and `nop` is not read.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte *vec = (npy_byte *)dataptr[1];
    const npy_byte factor = *(npy_byte *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand kernel where operand 1 is read once as a single npy_byte
 * value, while operand 0 (dataptr[0]) and the output (dataptr[2]) are
 * passed as npy_byte arrays to byte_sum_of_products_muladd():
 *
 *     out[i] = op1 * op0[i] + out[i]    for 0 <= i < count
 *
 * `strides` is ignored and `nop` is not read.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte *vec = (npy_byte *)dataptr[0];
    const npy_byte factor = *(npy_byte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand kernel that reduces into a single output element:
 *
 *     *out = *out + sum(in0[i] * in1[i])    for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] are read as contiguous npy_byte arrays;
 * dataptr[2] points at the one npy_byte that receives the result.
 * `strides` is ignored and `nop` is not read.
 *
 * Scalar implementation.  Each product is stored in an npy_byte before it
 * is added to the npy_byte running total.  Unless NPY_DISABLE_OPTIMIZATION
 * is defined, products are formed four at a time.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte total = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        const npy_byte p0 = lhs[0] * rhs[0];
        const npy_byte p1 = lhs[1] * rhs[1];
        const npy_byte p2 = lhs[2] * rhs[2];
        const npy_byte p3 = lhs[3] * rhs[3];

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Leftover elements, one at a time */
    while (count > 0) {
        const npy_byte x = *lhs;
        const npy_byte y = *rhs;

        total += x * y;

        --count;
        ++lhs;
        ++rhs;
    }

    *out = (*out) + total;
}

/*
 * Two-operand kernel that reduces into a single output element, with
 * operand 0 read once as a single npy_byte value:
 *
 *     *out = *out + op0 * sum(op1[i])    for 0 <= i < count
 *
 * The sum over operand 1 (dataptr[1]) is delegated to byte_sum_of_arr();
 * dataptr[2] points at the npy_byte that receives the result.  `strides`
 * is ignored and `nop` is not read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    const npy_byte factor = *(npy_byte *)dataptr[0];
    const npy_byte vec_sum = byte_sum_of_arr((npy_byte *)dataptr[1], count);

    *out = (*out) + factor * vec_sum;
}

/*
 * Two-operand kernel that reduces into a single output element, with
 * operand 1 read once as a single npy_byte value:
 *
 *     *out = *out + op1 * sum(op0[i])    for 0 <= i < count
 *
 * The sum over operand 0 (dataptr[0]) is delegated to byte_sum_of_arr();
 * dataptr[2] points at the npy_byte that receives the result.  `strides`
 * is ignored and `nop` is not read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    const npy_byte factor = *(npy_byte *)dataptr[1];
    const npy_byte vec_sum = byte_sum_of_arr((npy_byte *)dataptr[0], count);

    *out = (*out) + factor * vec_sum;
}

#elif 1 == 3 && !0

/*
 * Three-operand kernel over consecutive npy_byte elements:
 *
 *     out[i] = in_a[i] * in_b[i] * in_c[i] + out[i]
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output.  Elements are
 * processed in increasing index order, eight per pass while at least eight
 * remain, followed by a tail of at most eight steps that stops as soon as
 * `count` is exhausted.  `strides` is unused.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in_a = (npy_byte *)dataptr[0];
    npy_byte *in_b = (npy_byte *)dataptr[1];
    npy_byte *in_c = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int lane;

    /* Full groups of eight elements */
    for (; count >= 8; count -= 8) {
        for (lane = 0; lane < 8; ++lane) {
            out[lane] = in_a[lane] * in_b[lane] * in_c[lane] + out[lane];
        }
        in_a += 8;
        in_b += 8;
        in_c += 8;
        out += 8;
    }

    /* Remaining elements: test-and-decrement before every step */
    for (lane = 0; lane < 8; ++lane) {
        if (count-- == 0) {
            return;
        }
        out[lane] = in_a[lane] * in_b[lane] * in_c[lane] + out[lane];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic sum-of-products loop for npy_byte operands that are stepped by
 * sizeof(npy_byte) per iteration.
 *
 * In the compiled (`#if !0`) arm, each of the `count` iterations multiplies
 * the current elements of dataptr[0] .. dataptr[nop-1] together, adds the
 * product to the element at dataptr[nop], and then advances every pointer
 * in dataptr[0..nop].  The entries of `dataptr` are modified in place.
 * `strides` is unused.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
        /* Running product of the operands, starting from operand 0 */
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        /*
         * After the loop above i == nop (for nop >= 1), so dataptr[i] below
         * is the same slot as dataptr[nop]: the current output value.
         */
        *(npy_byte *)dataptr[nop] = (temp +
                                           (*(npy_byte *)dataptr[i]));
        /* Step all operands and the output to the next element */
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_byte);
        }
#else /* complex */
        /*
         * This arm is not compiled (the `#if !0` above always holds).  It
         * treats each element as an interleaved (re, im) pair.
         */
#  if 1 <= 3
#    define _SUMPROD_NOP 1
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_byte re, im, tmp;
        int i;
        re = ((npy_byte *)dataptr[0])[0];
        im = ((npy_byte *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            /* (re + i*im) *= (operand_re + i*operand_im) */
            tmp = re * ((npy_byte *)dataptr[i])[0] -
                  im * ((npy_byte *)dataptr[i])[1];
            im = re * ((npy_byte *)dataptr[i])[1] +
                 im * ((npy_byte *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_byte);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * One-operand reduction into a single output element.
 *
 * In the compiled (`#if !0`) arm, operand 0 and `count` are passed to
 * byte_sum_of_arr() and the returned value is added to the npy_byte at
 * dataptr[1].
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_byte *out = (npy_byte *)dataptr[1];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);

    *out = total + *out;
#else
    /* Interleaved (re, im) variant; not compiled while the test above is `!0` */
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *out = (npy_byte *)dataptr[1];
    npy_byte sum_re = 0;
    npy_byte sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per pass while more than four pairs remain */
    while (count > 4) {
        const npy_byte re_lo = src[0] + src[2];
        const npy_byte re_hi = src[4] + src[6];
        const npy_byte im_lo = src[1] + src[3];
        const npy_byte im_hi = src[5] + src[7];

        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
        count -= 4;
        src += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* One (re, im) pair per pass for whatever is left */
    while (count > 0) {
        sum_re += src[0];
        sum_im += src[1];
        --count;
        src += 2;
    }
    out[0] += sum_re;
    out[1] += sum_im;
#endif // !0
}

#endif /* 1 == 1 */

/*
 * One-operand reduction with an arbitrary input stride into a single
 * output element.
 *
 * With the constant preprocessor tests as written, the compiled path is:
 * read an npy_byte at data0, add it to `accum`, advance data0 by strides[0]
 * bytes, repeat `count` times, then add `accum` to the npy_byte at
 * dataptr[1].  All other arms (two/three/N operands, complex) are
 * compiled out in this expansion.
 */
static void
byte_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* `#if 0`: the complex accumulators are not compiled; `accum` is used */
#if 0
    npy_byte accum_re = 0, accum_im = 0;
#else
    npy_byte accum = 0;
#endif

    /* Only the data0/stride0 pair is declared here: the first test is true */
#if (1 == 1) || (1 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1 == 2 || 1 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 1 == 1
        /* Compiled arm: plain running sum, stepping by stride0 bytes */
        accum += (*(npy_byte *)data0);
        data0 += stride0;
#  elif 1 == 2
        accum += (*(npy_byte *)data0) *
                 (*(npy_byte *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 1 == 3
        accum += (*(npy_byte *)data0) *
                 (*(npy_byte *)data1) *
                 (*(npy_byte *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        accum += temp;
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1 == 1
        accum_re += ((npy_byte *)data0)[0];
        accum_im += ((npy_byte *)data0)[1];
        data0 += stride0;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_byte re, im, tmp;
        int i;
        re = ((npy_byte *)dataptr[0])[0];
        im = ((npy_byte *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_byte *)dataptr[i])[0] -
                  im * ((npy_byte *)dataptr[i])[1];
            im = re * ((npy_byte *)dataptr[i])[1] +
                 im * ((npy_byte *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulated value into the output element */
#if 0
#  if 1 <= 3
    ((npy_byte *)dataptr[1])[0] += accum_re;
    ((npy_byte *)dataptr[1])[1] += accum_im;
#  else
    ((npy_byte *)dataptr[nop])[0] += accum_re;
    ((npy_byte *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 1 <= 3
    /* Compiled arm: the output lives at dataptr[1] */
    *((npy_byte *)dataptr[1]) = (accum +
                                    (*((npy_byte *)dataptr[1])));
#  else
    *((npy_byte *)dataptr[nop]) = (accum +
                                    (*((npy_byte *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * Two-operand sum of products with arbitrary byte strides for both inputs
 * and the output.
 *
 * With the constant preprocessor tests as written, the compiled path is,
 * for each of `count` iterations:
 *
 *     *data_out = (*data0) * (*data1) + (*data_out)
 *
 * after which data0, data1 and data_out advance by strides[0], strides[1]
 * and strides[2] bytes respectively.  The one/three/N-operand and complex
 * arms are compiled out in this expansion.
 */
static void
byte_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (2 == 1) || (2 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (2 == 2 || 2 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (2 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
    /* The output follows the two inputs, i.e. index 2 */
#if (2 == 1) || (2 <= 3 && !0)
    char *data_out = dataptr[2];
    npy_intp stride_out = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_two (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 2 == 1
        *(npy_byte *)data_out = ((*(npy_byte *)data0) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 2 == 2
        /* Compiled arm: multiply the two inputs and accumulate in place */
        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
                                         (*(npy_byte *)data1) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 2 == 3
        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
                                         (*(npy_byte *)data1) *
                                         (*(npy_byte *)data2) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        *(npy_byte *)dataptr[nop] = (temp +
                                           (*(npy_byte *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 2 == 1
        ((npy_byte *)data_out)[0] = ((npy_byte *)data0)[0] +
                                         ((npy_byte *)data_out)[0];
        ((npy_byte *)data_out)[1] = ((npy_byte *)data0)[1] +
                                         ((npy_byte *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_byte re, im, tmp;
        int i;
        re = ((npy_byte *)dataptr[0])[0];
        im = ((npy_byte *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_byte *)dataptr[i])[0] -
                  im * ((npy_byte *)dataptr[i])[1];
            im = re * ((npy_byte *)dataptr[i])[1] +
                 im * ((npy_byte *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 2 == 1

/*
 * One-operand kernel over consecutive npy_byte elements:
 *
 *     dst[i] = src[i] + dst[i]
 *
 * dataptr[0] is the input and dataptr[1] the output.  Counts of 0..7 are
 * finished directly, highest index first; larger counts are consumed eight
 * elements at a time in increasing index order, after which the remainder
 * goes through the short-count stage.  A count outside both ranges
 * (i.e. negative) is matched by neither stage.  `strides` is unused.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Short-count stage comes first so that small counts are fast */
        if (count >= 0 && count <= 7) {
            for (k = count - 1; k >= 0; --k) {
#if !0
                dst[k] = src[k] + dst[k];
#else
                /* Interleaved (re, im) pairs; not compiled */
                dst[2*k] = src[2*k] + dst[2*k];
                dst[2*k + 1] = src[2*k + 1] + dst[2*k + 1];
#endif
            }
            return;
        }

        /* Bulk stage: eight elements per pass */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
#if !0
                dst[k] = src[k] + dst[k];
#else /* complex */
                dst[2*k] = src[2*k] + dst[2*k];
                dst[2*k + 1] = src[2*k + 1] + dst[2*k + 1];
#endif
            }
            count -= 8;
            src += 8;
            dst += 8;
        }
    }
}

#elif 2 == 2 && !0

/*
 * Multiply-add helper: for each of `count` consecutive elements,
 *
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * data      input elements (read only by this function)
 * data_out  elements that are read, updated and written back
 * scalar    factor applied to every element of `data`
 * count     number of elements to process
 *
 * The vectorised arm is compiled out (`#if 0`); the scalar arm below it is
 * what is built for npy_byte.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
{
#if 0 // NPYV check for npy_byte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;
    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
    #line 306
    if(is_aligned) {
        /* Four vectors per pass, using the aligned load/store variants */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
            
#line 312
            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
            
#line 312
            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
            
#line 312
            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
            
            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
            
#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
            
#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
            
#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        /* Same four-vector pass, using the plain load/store variants */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
            
#line 312
            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
            
#line 312
            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
            
#line 312
            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
            
            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
            
#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
            
#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
            
#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Leftover elements, one vector step at a time, bounded by `count` */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data, count);
        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
    }
    npyv_cleanup();
#else
    /*
     * Scalar arm.  Unless NPY_DISABLE_OPTIMIZATION is defined, elements are
     * first handled in groups of four; within a group all eight loads
     * happen before the four results are computed and stored.
     */
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_byte b0 = (data[0]);
        const npy_byte c0 = (data_out[0]);
        
#line 340
        const npy_byte b1 = (data[1]);
        const npy_byte c1 = (data_out[1]);
        
#line 340
        const npy_byte b2 = (data[2]);
        const npy_byte c2 = (data_out[2]);
        
#line 340
        const npy_byte b3 = (data[3]);
        const npy_byte c3 = (data_out[3]);
        
        #line 346
        const npy_byte abc0 = scalar * b0 + c0;
        
#line 346
        const npy_byte abc1 = scalar * b1 + c1;
        
#line 346
        const npy_byte abc2 = scalar * b2 + c2;
        
#line 346
        const npy_byte abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled) */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_byte b = (*data);
        const npy_byte c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_byte
}

/*
 * Two-operand kernel over consecutive npy_byte elements:
 *
 *     data_out[i] = data0[i] * data1[i] + data_out[i]
 *
 * dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the output and
 * `count` is the number of elements.  `strides` is unused.
 *
 * The vectorised arm is compiled out (`#if 0`); the scalar arm below it is
 * what is built for npy_byte.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data_out = (npy_byte *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_byte
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;

    #line 384
    if(is_aligned) {
        /* Four vectors per pass, using the aligned load/store variants */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_storea_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        /* Same four-vector pass, using the plain load/store variants */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_store_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Leftover elements, one vector step at a time, bounded by `count` */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data0, count);
        npyv_s8 b = npyv_load_tillz_s8(data1, count);
        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
    }
    npyv_cleanup();
#else
    /*
     * Scalar arm.  Unless NPY_DISABLE_OPTIMIZATION is defined, elements are
     * first handled in groups of four; within a group all twelve loads
     * happen before the four results are computed and stored.
     */
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_byte a0 = (data0[0]);
        const npy_byte b0 = (data1[0]);
        const npy_byte c0 = (data_out[0]);
        
#line 420
        const npy_byte a1 = (data0[1]);
        const npy_byte b1 = (data1[1]);
        const npy_byte c1 = (data_out[1]);
        
#line 420
        const npy_byte a2 = (data0[2]);
        const npy_byte b2 = (data1[2]);
        const npy_byte c2 = (data_out[2]);
        
#line 420
        const npy_byte a3 = (data0[3]);
        const npy_byte b3 = (data1[3]);
        const npy_byte c3 = (data_out[3]);
        
        #line 427
        const npy_byte abc0 = a0 * b0 + c0;
        
#line 427
        const npy_byte abc1 = a1 * b1 + c1;
        
#line 427
        const npy_byte abc2 = a2 * b2 + c2;
        
#line 427
        const npy_byte abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled) */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_byte a = (*data0);
        const npy_byte b = (*data1);
        const npy_byte c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_byte

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand specialization: operand 0 is read once as a single npy_byte
 * and used as the scalar factor; operand 1 and the output are forwarded to
 * byte_sum_of_products_muladd() with `count`.  `strides` is unused.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_byte factor = *(npy_byte *)dataptr[0];
    npy_byte *src = (npy_byte *)dataptr[1];
    npy_byte *dst = (npy_byte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand specialization: operand 1 is read once as a single npy_byte
 * and used as the scalar factor; operand 0 and the output are forwarded to
 * byte_sum_of_products_muladd() with `count`.  `strides` is unused.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_byte factor = *(npy_byte *)dataptr[1];
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Reduction over two contiguous npy_byte operands: accumulates
 * data0[i] * data1[i] for `count` elements and adds the total to the single
 * output element at dataptr[2]. The output is read and written exactly once,
 * after all inputs have been consumed.
 *
 * nop      not used by this kernel
 * dataptr  [0], [1]: contiguous inputs, [2]: output element
 * strides  unused
 * count    number of element pairs
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *lhs = (npy_byte *)dataptr[0];
    npy_byte *rhs = (npy_byte *)dataptr[1];
    npy_byte total = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_byte
    /* Vector path; compiled out for npy_byte by the `#if 0` above */
    const int both_aligned = EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs);
    const int lanes = npyv_nlanes_s8;
    npyv_s8 vtotal = npyv_zero_s8();

    if (both_aligned) {
        /* Four vectors of each operand per pass, aligned loads */
        const npy_intp block = lanes * 4;
        while (count >= block) {
            npyv_s8 l0 = npyv_loada_s8(lhs + lanes * 0);
            npyv_s8 r0 = npyv_loada_s8(rhs + lanes * 0);
            npyv_s8 l1 = npyv_loada_s8(lhs + lanes * 1);
            npyv_s8 r1 = npyv_loada_s8(rhs + lanes * 1);
            npyv_s8 l2 = npyv_loada_s8(lhs + lanes * 2);
            npyv_s8 r2 = npyv_loada_s8(rhs + lanes * 2);
            npyv_s8 l3 = npyv_loada_s8(lhs + lanes * 3);
            npyv_s8 r3 = npyv_loada_s8(rhs + lanes * 3);

            /* Chain the multiply-adds through the running accumulator */
            npyv_s8 t3 = npyv_muladd_s8(l3, r3, vtotal);
            npyv_s8 t2 = npyv_muladd_s8(l2, r2, t3);
            npyv_s8 t1 = npyv_muladd_s8(l1, r1, t2);
            vtotal = npyv_muladd_s8(l0, r0, t1);

            count -= block;
            lhs += block;
            rhs += block;
        }
    }
    else {
        /* Same as above with the plain (non-aligned) load */
        const npy_intp block = lanes * 4;
        while (count >= block) {
            npyv_s8 l0 = npyv_load_s8(lhs + lanes * 0);
            npyv_s8 r0 = npyv_load_s8(rhs + lanes * 0);
            npyv_s8 l1 = npyv_load_s8(lhs + lanes * 1);
            npyv_s8 r1 = npyv_load_s8(rhs + lanes * 1);
            npyv_s8 l2 = npyv_load_s8(lhs + lanes * 2);
            npyv_s8 r2 = npyv_load_s8(rhs + lanes * 2);
            npyv_s8 l3 = npyv_load_s8(lhs + lanes * 3);
            npyv_s8 r3 = npyv_load_s8(rhs + lanes * 3);

            npyv_s8 t3 = npyv_muladd_s8(l3, r3, vtotal);
            npyv_s8 t2 = npyv_muladd_s8(l2, r2, t3);
            npyv_s8 t1 = npyv_muladd_s8(l1, r1, t2);
            vtotal = npyv_muladd_s8(l0, r0, t1);

            count -= block;
            lhs += block;
            rhs += block;
        }
    }

    /* Remainder, one (possibly partial) vector at a time */
    while (count > 0) {
        npyv_s8 l = npyv_load_tillz_s8(lhs, count);
        npyv_s8 r = npyv_load_tillz_s8(rhs, count);
        vtotal = npyv_muladd_s8(l, r, vtotal);

        count -= lanes;
        lhs += lanes;
        rhs += lanes;
    }
    total = npyv_sum_s8(vtotal);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, four products per pass */
    while (count >= 4) {
        const npy_byte p0 = lhs[0] * rhs[0];
        const npy_byte p1 = lhs[1] * rhs[1];
        const npy_byte p2 = lhs[2] * rhs[2];
        const npy_byte p3 = lhs[3] * rhs[3];

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one product at a time */
    while (count > 0) {
        const npy_byte l = *lhs;
        const npy_byte r = *rhs;
        total += l * r;

        --count;
        ++lhs;
        ++rhs;
    }
#endif // NPYV check for npy_byte
    *(npy_byte *)dataptr[2] = total + *(npy_byte *)dataptr[2];
}

/*
 * Two-operand kernel where operand 0 is a single value, operand 1 is
 * contiguous and the output is a single element: adds
 * value0 * byte_sum_of_arr(data1, count) to the output element.
 *
 * nop      not used by this kernel
 * dataptr  [0]: scalar input, [1]: contiguous input, [2]: output element
 * strides  unused
 * count    number of elements in operand 1
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_byte scalar = *(npy_byte *)dataptr[0];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[1], count);
    npy_byte *out = (npy_byte *)dataptr[2];

    *out = *out + scalar * total;
}

/*
 * Two-operand kernel where operand 0 is contiguous, operand 1 is a single
 * value and the output is a single element: adds
 * value1 * byte_sum_of_arr(data0, count) to the output element.
 *
 * nop      not used by this kernel
 * dataptr  [0]: contiguous input, [1]: scalar input, [2]: output element
 * strides  unused
 * count    number of elements in operand 0
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);
    const npy_byte scalar = *(npy_byte *)dataptr[1];
    npy_byte *out = (npy_byte *)dataptr[2];

    *out = *out + scalar * total;
}

#elif 2 == 3 && !0

/*
 * Three contiguous inputs, contiguous output:
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 * Elements are handled strictly in increasing index order, eight per pass
 * of the main loop, followed by a tail of up to seven elements.
 *
 * nop      not used by this kernel
 * dataptr  [0], [1], [2]: contiguous inputs, [3]: contiguous output
 * strides  unused
 * count    number of elements
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in0 = (npy_byte *)dataptr[0];
    npy_byte *in1 = (npy_byte *)dataptr[1];
    npy_byte *in2 = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int k;

/* Update of a single element at offset k from the current pointers */
#define BYTE_PROD3_STEP(k) \
    (out[(k)] = in0[(k)] * in1[(k)] * in2[(k)] + out[(k)])

    /* Main loop, unrolled by 8 */
    for (; count >= 8; count -= 8) {
        BYTE_PROD3_STEP(0);
        BYTE_PROD3_STEP(1);
        BYTE_PROD3_STEP(2);
        BYTE_PROD3_STEP(3);
        BYTE_PROD3_STEP(4);
        BYTE_PROD3_STEP(5);
        BYTE_PROD3_STEP(6);
        BYTE_PROD3_STEP(7);

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /* Tail: stop as soon as the remaining count reaches zero */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        BYTE_PROD3_STEP(k);
    }

#undef BYTE_PROD3_STEP
}

#else /* 2 > 3 || @complex */

/*
 * Generic contiguous form: for each of `count` steps, multiplies together
 * the current element of every input in dataptr[0..nop-1], adds the product
 * to the current output element at dataptr[nop], then advances all nop + 1
 * pointers by sizeof(npy_byte). The entries of dataptr are modified in
 * place.
 *
 * nop      number of input operands
 * dataptr  nop input pointers followed by the output pointer
 * strides  unused
 * count    number of steps
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op;
        npy_byte prod = *(npy_byte *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            prod *= *(npy_byte *)dataptr[op];
        }
        /* `op` still holds the index at which the loop above stopped */
        *(npy_byte *)dataptr[nop] = prod + *(npy_byte *)dataptr[op];

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_byte);
        }
#else /* complex */
#  if 2 <= 3
#    define _SUMPROD_NOP 2
#  else
#    define _SUMPROD_NOP nop
#  endif
        int op;
        npy_byte next_re;
        npy_byte acc_re = ((npy_byte *)dataptr[0])[0];
        npy_byte acc_im = ((npy_byte *)dataptr[0])[1];

        /* Running complex product over the inputs */
        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_byte *factor = (npy_byte *)dataptr[op];

            next_re = acc_re * factor[0] - acc_im * factor[1];
            acc_im = acc_re * factor[1] + acc_im * factor[0];
            acc_re = next_re;
        }
        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] =
                        acc_re + ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] =
                        acc_im + ((npy_byte *)dataptr[_SUMPROD_NOP])[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += sizeof(npy_byte);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * Single contiguous input reduced into a single output element.
 * In the active (`#if !0`) branch the input is summed by byte_sum_of_arr
 * and the result is added to the element at dataptr[1]. The `#else` branch
 * is the variant for interleaved (re, im) pairs and keeps two accumulators.
 *
 * nop      not used by this kernel
 * dataptr  [0]: contiguous input, [1]: output element
 * strides  not used by this kernel
 * count    number of input elements
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_byte *out = (npy_byte *)dataptr[1];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);

    *out = total + *out;
#else
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *out = (npy_byte *)dataptr[1];
    npy_byte sum_re = 0, sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per pass */
    while (count > 4) {
        const npy_byte re_lo = src[0] + src[2];
        const npy_byte re_hi = src[4] + src[6];
        const npy_byte im_lo = src[1] + src[3];
        const npy_byte im_hi = src[5] + src[7];

        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;

        count -= 4;
        src += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining pairs one at a time */
    while (count > 0) {
        sum_re += src[0];
        sum_im += src[1];

        --count;
        src += 2;
    }
    out[0] += sum_re;
    out[1] += sum_im;
#endif // !0
}

#endif /* 2 == 1 */

/*
 * Strided reduction for two operands into a single output element.
 *
 * This is an expanded template: the literal `2` in the conditions below is
 * the operand count of this instantiation and `!0` selects the non-complex
 * code (the `#else` arms are labelled "complex"). With those values the
 * compiled code is:
 *
 *     accum = 0;
 *     repeat `count` times:
 *         accum += *data0 * *data1;  data0 += stride0;  data1 += stride1;
 *     *dataptr[2] += accum;
 *
 * nop      operand count; only referenced by the branches for more than
 *          three operands, which are not compiled here
 * dataptr  [0], [1]: inputs, [2]: output element
 * strides  per-operand step, added to char pointers (i.e. in bytes)
 * count    number of steps
 */
static void
byte_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* Accumulator(s): a (re, im) pair for the complex variant, else one value */
#if 0
    npy_byte accum_re = 0, accum_im = 0;
#else
    npy_byte accum = 0;
#endif

    /* Local copies of the input pointers/strides for up to three operands */
#if (2 == 1) || (2 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (2 == 2 || 2 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (2 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 2 == 1
        accum += (*(npy_byte *)data0);
        data0 += stride0;
#  elif 2 == 2
        /* Branch compiled for this instantiation */
        accum += (*(npy_byte *)data0) *
                 (*(npy_byte *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 2 == 3
        accum += (*(npy_byte *)data0) *
                 (*(npy_byte *)data1) *
                 (*(npy_byte *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        /* Generic operand count: works directly on dataptr[] */
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        accum += temp;
        /* Only the inputs advance; the output pointer stays put */
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 2 == 1
        accum_re += ((npy_byte *)data0)[0];
        accum_im += ((npy_byte *)data0)[1];
        data0 += stride0;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_byte re, im, tmp;
        int i;
        re = ((npy_byte *)dataptr[0])[0];
        im = ((npy_byte *)dataptr[0])[1];
        /* Running complex product; tmp keeps the new real part until im is updated */
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_byte *)dataptr[i])[0] -
                  im * ((npy_byte *)dataptr[i])[1];
            im = re * ((npy_byte *)dataptr[i])[1] +
                 im * ((npy_byte *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulated value into the output element, once */
#if 0
#  if 2 <= 3
    ((npy_byte *)dataptr[2])[0] += accum_re;
    ((npy_byte *)dataptr[2])[1] += accum_im;
#  else
    ((npy_byte *)dataptr[nop])[0] += accum_re;
    ((npy_byte *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 2 <= 3
    *((npy_byte *)dataptr[2]) = (accum +
                                    (*((npy_byte *)dataptr[2])));
#  else
    *((npy_byte *)dataptr[nop]) = (accum +
                                    (*((npy_byte *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * Fully strided three-operand kernel.
 *
 * This is an expanded template: the literal `3` in the conditions below is
 * the operand count of this instantiation and `!0` selects the non-complex
 * code (the `#else` arms are labelled "complex"). With those values the
 * compiled loop body is:
 *
 *     *data_out = *data0 * *data1 * *data2 + *data_out;
 *
 * followed by advancing each of the four pointers by its own stride.
 *
 * nop      operand count; only referenced by the branch for more than
 *          three operands, which is not compiled here
 * dataptr  [0], [1], [2]: inputs, [3]: output
 * strides  per-operand step, added to char pointers (i.e. in bytes)
 * count    number of steps
 */
static void
byte_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* Local copies of the pointers/strides for up to three inputs + output */
#if (3 == 1) || (3 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (3 == 2 || 3 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (3 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
#if (3 == 1) || (3 <= 3 && !0)
    char *data_out = dataptr[3];
    npy_intp stride_out = strides[3];
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 3 == 1
        *(npy_byte *)data_out = ((*(npy_byte *)data0) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 3 == 2
        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
                                         (*(npy_byte *)data1) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 3 == 3
        /* Branch compiled for this instantiation */
        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
                                         (*(npy_byte *)data1) *
                                         (*(npy_byte *)data2) +
                                         (*(npy_byte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        /* Generic operand count: works directly on dataptr[] */
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        /* i is left at the loop's exit value when the output is read here */
        *(npy_byte *)dataptr[nop] = (temp +
                                           (*(npy_byte *)dataptr[i]));
        /* Inputs and output all advance (note the <=) */
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 3 == 1
        ((npy_byte *)data_out)[0] = ((npy_byte *)data0)[0] +
                                         ((npy_byte *)data_out)[0];
        ((npy_byte *)data_out)[1] = ((npy_byte *)data0)[1] +
                                         ((npy_byte *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 3 <= 3
#define _SUMPROD_NOP 3
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_byte re, im, tmp;
        int i;
        re = ((npy_byte *)dataptr[0])[0];
        im = ((npy_byte *)dataptr[0])[1];
        /* Running complex product; tmp keeps the new real part until im is updated */
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_byte *)dataptr[i])[0] -
                  im * ((npy_byte *)dataptr[i])[1];
            im = re * ((npy_byte *)dataptr[i])[1] +
                 im * ((npy_byte *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 3 == 1

/*
 * Single contiguous input added element-wise into a contiguous output:
 *     data_out[i] = data0[i] + data_out[i]
 *
 * Counts of 0..7 are finished directly, walking from the highest index down
 * to index 0. Larger counts are first reduced in chunks of eight (walked in
 * increasing index order) and the remainder is then finished the same way.
 * The `#else` arms hold the variant for interleaved two-component elements.
 *
 * nop      not used by this kernel
 * dataptr  [0]: contiguous input, [1]: contiguous output
 * strides  unused
 * count    number of elements
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *src = (npy_byte *)dataptr[0];
    npy_byte *dst = (npy_byte *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Checked before the chunked loop so that small counts stay cheap */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
#if !0
                dst[k] = src[k] + dst[k];
#else
                dst[2*k] = src[2*k] + dst[2*k];
                dst[2*k + 1] = src[2*k + 1] + dst[2*k + 1];
#endif
            }
            return;
        }

        /* Chunks of eight elements */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
#if !0
                dst[k] = src[k] + dst[k];
#else /* complex */
                dst[2*k] = src[2*k] + dst[2*k];
                dst[2*k + 1] = src[2*k + 1] + dst[2*k + 1];
#endif
            }

            src += 8;
            dst += 8;
        }
    }
}

#elif 3 == 2 && !0

/*
 * Multiply-add kernel over contiguous arrays:
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 * data      input elements (only read)
 * data_out  elements that are read, updated and written back
 * scalar    multiplier applied to every element of data
 * count     number of elements to process
 *
 * Three code paths are present: an NPYV vector path (compiled out here by
 * the `#if 0`), a 4-way unrolled scalar loop (skipped when
 * NPY_DISABLE_OPTIMIZATION is defined) and a one-element-at-a-time loop
 * that handles whatever remains. In the unrolled paths all inputs of a
 * group are loaded before any result of that group is stored.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
{
#if 0 // NPYV check for npy_byte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;
    /* The scalar replicated into every lane */
    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
    #line 306
    if(is_aligned) {
        /* Four vectors per pass, aligned loads and stores */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);

#line 312
            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);

#line 312
            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);

#line 312
            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);

            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);

#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);

#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);

#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);

            #line 323
            npyv_storea_s8(data_out + vstep * 0, abc0);

#line 323
            npyv_storea_s8(data_out + vstep * 1, abc1);

#line 323
            npyv_storea_s8(data_out + vstep * 2, abc2);

#line 323
            npyv_storea_s8(data_out + vstep * 3, abc3);

        }
    }

#line 306
    else {
        /* Same as above with the plain (non-aligned) loads and stores */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);

#line 312
            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);

#line 312
            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);

#line 312
            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);

            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);

#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);

#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);

#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);

            #line 323
            npyv_store_s8(data_out + vstep * 0, abc0);

#line 323
            npyv_store_s8(data_out + vstep * 1, abc1);

#line 323
            npyv_store_s8(data_out + vstep * 2, abc2);

#line 323
            npyv_store_s8(data_out + vstep * 3, abc3);

        }
    }

    /* Remainder: one vector per pass using the count-limited load/store calls */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data, count);
        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, four elements per pass: load all, compute all, store all */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_byte b0 = (data[0]);
        const npy_byte c0 = (data_out[0]);

#line 340
        const npy_byte b1 = (data[1]);
        const npy_byte c1 = (data_out[1]);

#line 340
        const npy_byte b2 = (data[2]);
        const npy_byte c2 = (data_out[2]);

#line 340
        const npy_byte b3 = (data[3]);
        const npy_byte c3 = (data_out[3]);

        #line 346
        const npy_byte abc0 = scalar * b0 + c0;

#line 346
        const npy_byte abc1 = scalar * b1 + c1;

#line 346
        const npy_byte abc2 = scalar * b2 + c2;

#line 346
        const npy_byte abc3 = scalar * b3 + c3;

        #line 351
        data_out[0] = (abc0);

#line 351
        data_out[1] = (abc1);

#line 351
        data_out[2] = (abc2);

#line 351
        data_out[3] = (abc3);

    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Leftover elements (or all of them when the unrolled loop is disabled) */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_byte b = (*data);
        const npy_byte c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_byte
}

static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data_out = (npy_byte *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_byte
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;

    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_storea_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_store_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data0, count);
        npyv_s8 b = npyv_load_tillz_s8(data1, count);
        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_byte a0 = (data0[0]);
        const npy_byte b0 = (data1[0]);
        const npy_byte c0 = (data_out[0]);
        
#line 420
        const npy_byte a1 = (data0[1]);
        const npy_byte b1 = (data1[1]);
        const npy_byte c1 = (data_out[1]);
        
#line 420
        const npy_byte a2 = (data0[2]);
        const npy_byte b2 = (data1[2]);
        const npy_byte c2 = (data_out[2]);
        
#line 420
        const npy_byte a3 = (data0[3]);
        const npy_byte b3 = (data1[3]);
        const npy_byte c3 = (data_out[3]);
        
        #line 427
        const npy_byte abc0 = a0 * b0 + c0;
        
#line 427
        const npy_byte abc1 = a1 * b1 + c1;
        
#line 427
        const npy_byte abc2 = a2 * b2 + c2;
        
#line 427
        const npy_byte abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_byte a = (*data0);
        const npy_byte b = (*data1);
        const npy_byte c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_byte

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel: operand 0 is read once as a single npy_byte, while
 * operand 1 and the output are handed on as npy_byte arrays of `count`
 * elements.  The per-element work is delegated to
 * byte_sum_of_products_muladd(); `nop` is not referenced.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte *vec = (npy_byte *)dataptr[1];
    const npy_byte scalar = *(npy_byte *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(vec, out, scalar, count);
}

/*
 * Two-operand kernel: operand 1 is read once as a single npy_byte, while
 * operand 0 and the output are handed on as npy_byte arrays of `count`
 * elements.  The per-element work is delegated to
 * byte_sum_of_products_muladd(); `nop` is not referenced.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    npy_byte *vec = (npy_byte *)dataptr[0];
    const npy_byte scalar = *(npy_byte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    byte_sum_of_products_muladd(vec, out, scalar, count);
}

/*
 * Two-operand kernel for the case where both inputs are walked as contiguous
 * npy_byte arrays and the result is reduced into one output element.
 *
 * The products data0[i] * data1[i] of the `count` elements behind dataptr[0]
 * and dataptr[1] are summed into `accum`, and that total is then added to the
 * npy_byte stored at dataptr[2].  `nop` is not referenced by the body and
 * `strides` is marked NPY_UNUSED.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte accum = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
/*
 * The condition below is the literal 0, so the NPYV branch is never compiled
 * in this expansion; only the scalar #else branch is live.
 * NOTE(review): the npyv_* names used in the disabled branch are declared
 * elsewhere (presumably simd/simd.h); their exact semantics are not visible
 * here.
 */
#if 0 // NPYV check for npy_byte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s8;
    npyv_s8 vaccum = npyv_zero_s8();

    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        /* Four vectors per operand per pass, loaded with the aligned variant. */
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
            
#line 501
            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
            
#line 501
            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
            
#line 501
            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, vaccum);
            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
                    vaccum = npyv_muladd_s8(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        /* Same loop as above, using the npyv_load_s8 variant instead. */
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
            
#line 501
            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
            
#line 501
            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
            
#line 501
            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
            
            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, vaccum);
            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
                    vaccum = npyv_muladd_s8(a0, b0, ab1);
        }
    }
    
    /* Remaining elements, one vector step at a time, using the *_tillz loads. */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data0, count);
        npyv_s8 b = npyv_load_tillz_s8(data1, count);
        vaccum = npyv_muladd_s8(a, b, vaccum);
    }
    accum = npyv_sum_s8(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled by four.  Each product is stored in an npy_byte
     * before the four of them are added into accum.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_byte ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_byte ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_byte ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_byte ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Leftover elements (or all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_byte a = (*data0);
        const npy_byte b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_byte
    /* Fold the accumulated total into the existing output value. */
    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum);
}

/*
 * Two-operand kernel reducing into a single output element: operand 0 is one
 * npy_byte value, operand 1 is an npy_byte array of `count` elements.  The
 * array is summed with byte_sum_of_arr(), the sum is multiplied by the single
 * value, and the result is added to the npy_byte at dataptr[2].
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    const npy_byte scalar = *(npy_byte *)dataptr[0];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[1], count);

    *out = (*out + scalar * total);
}

/*
 * Two-operand kernel reducing into a single output element: operand 0 is an
 * npy_byte array of `count` elements, operand 1 is one npy_byte value.  The
 * array is summed with byte_sum_of_arr(), the sum is multiplied by the single
 * value, and the result is added to the npy_byte at dataptr[2].
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *out = (npy_byte *)dataptr[2];
    const npy_byte scalar = *(npy_byte *)dataptr[1];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);

    *out = (*out + scalar * total);
}

#elif 3 == 3 && !0

/*
 * Three-operand kernel for npy_byte operands that are all walked as
 * contiguous arrays:
 *
 *     out[i] = in_a[i] * in_b[i] * in_c[i] + out[i]
 *
 * The inputs come from dataptr[0..2] and the output from dataptr[3].
 * `nop` is not referenced and `strides` is marked NPY_UNUSED.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *in_a = (npy_byte *)dataptr[0];
    npy_byte *in_b = (npy_byte *)dataptr[1];
    npy_byte *in_c = (npy_byte *)dataptr[2];
    npy_byte *out = (npy_byte *)dataptr[3];
    int k;

    /* Bulk of the work: eight elements per pass, in ascending order. */
    for (; count >= 8; count -= 8, in_a += 8, in_b += 8, in_c += 8, out += 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = (in_a[k] * in_b[k] * in_c[k] + out[k]);
        }
    }

    /* Leftovers: stop as soon as the remaining count has reached zero. */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = (in_a[k] * in_b[k] * in_c[k] + out[k]);
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic contiguous kernel for `nop` npy_byte operands.
 *
 * For each of the `count` elements, the current values of operands
 * dataptr[0] .. dataptr[nop - 1] are multiplied together and the product is
 * added to the output element at dataptr[nop].  All nop + 1 pointers in
 * `dataptr` are advanced IN PLACE by sizeof(npy_byte) per element, so the
 * caller's array is modified.  `strides` is marked NPY_UNUSED.
 *
 * The complex-number code path of the template is not reproduced: it sat
 * behind `#if !0 ... #else` and could never be compiled for this type.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    while (count--) {
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;

        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        /*
         * Read and write the same output slot explicitly, rather than
         * reading it through the loop index left over from the loop above
         * (which only equals nop when nop >= 1).
         */
        *(npy_byte *)dataptr[nop] = (temp +
                                           (*(npy_byte *)dataptr[nop]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_byte);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * One-operand kernel reducing a contiguous npy_byte array into a single
 * output element: the `count` elements behind dataptr[0] are summed with
 * byte_sum_of_arr() and the total is added to the npy_byte at dataptr[1].
 *
 * `nop` and `strides` are not referenced.  The complex-number accumulation
 * path of the template is not reproduced: it sat behind `#if !0 ... #else`
 * and could never be compiled for this type.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_byte *data = (npy_byte *)dataptr[0];
    npy_byte accum;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    accum = byte_sum_of_arr(data, count);
    *((npy_byte *)dataptr[1]) = (accum + (*((npy_byte *)dataptr[1])));
}

#endif /* 3 == 1 */

/*
 * Three-operand kernel reducing into a single npy_byte output element.
 *
 * The three inputs start at dataptr[0..2] and are stepped by strides[0..2]
 * bytes per element.  For each of the `count` elements the three values are
 * multiplied and added into an npy_byte accumulator; the accumulated total is
 * finally added to the npy_byte at dataptr[3].  The entries of `dataptr` are
 * not modified (local cursors are used).  `nop` is not referenced.
 *
 * Only the path selected for three real-valued operands is kept; the
 * template's one-/two-operand, generic-nop and complex branches were guarded
 * by constant-false preprocessor conditions.
 */
static void
byte_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_byte accum = 0;

    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    while (count--) {
        accum += (*(npy_byte *)data0) *
                 (*(npy_byte *)data1) *
                 (*(npy_byte *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
    }

    /* Fold the accumulated total into the existing output value. */
    *((npy_byte *)dataptr[3]) = (accum +
                                    (*((npy_byte *)dataptr[3])));
}


#line 131
/*
 * Generic strided kernel for `nop` npy_byte operands.
 *
 * For each of the `count` elements, the current values of operands
 * dataptr[0] .. dataptr[nop - 1] are multiplied together and the product is
 * added to the output element at dataptr[nop].  After every element all
 * nop + 1 pointers in `dataptr` are advanced IN PLACE by the matching entry
 * of `strides` (in bytes), so the caller's array is modified.
 *
 * Only the generic real-valued path is kept; the template's fixed one-, two-
 * and three-operand branches and its complex branch were guarded by
 * constant-false preprocessor conditions.
 */
static void
byte_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_any (%d)\n", (int)count);

    while (count--) {
        npy_byte temp = (*(npy_byte *)dataptr[0]);
        int i;

        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_byte *)dataptr[i]);
        }
        /*
         * Read and write the same output slot explicitly, rather than
         * reading it through the loop index left over from the loop above
         * (which only equals nop when nop >= 1).
         */
        *(npy_byte *)dataptr[nop] = (temp +
                                           (*(npy_byte *)dataptr[nop]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
    }
}

#if 1000 == 1

/*
 * One-operand kernel for contiguous npy_byte input and output:
 *
 *     data_out[i] = data0[i] + data_out[i]      for `count` elements
 *
 * The input comes from dataptr[0] and the output from dataptr[1].  `nop` is
 * not referenced and `strides` is marked NPY_UNUSED.
 *
 * Counts below eight are dispatched through the switch before the chunk loop
 * is ever tested, to keep small counts fast; larger counts run the chunk loop
 * and then come back to the switch for the remainder.  The remainder is
 * processed from the highest index down to index 0.
 *
 * A negative count (which matches no case) now returns instead of cycling
 * forever.  The complex-number variant of each statement is not reproduced:
 * it sat behind `#if !0 ... #else` and could never be compiled for this type.
 */
static void
byte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data_out = (npy_byte *)dataptr[1];
    int k;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* This is placed before the main loop to make small counts faster */
        switch (count) {
            case 7:
                data_out[6] = ((data0[6]) + (data_out[6]));
                /* fallthrough */
            case 6:
                data_out[5] = ((data0[5]) + (data_out[5]));
                /* fallthrough */
            case 5:
                data_out[4] = ((data0[4]) + (data_out[4]));
                /* fallthrough */
            case 4:
                data_out[3] = ((data0[3]) + (data_out[3]));
                /* fallthrough */
            case 3:
                data_out[2] = ((data0[2]) + (data_out[2]));
                /* fallthrough */
            case 2:
                data_out[1] = ((data0[1]) + (data_out[1]));
                /* fallthrough */
            case 1:
                data_out[0] = ((data0[0]) + (data_out[0]));
                /* fallthrough */
            case 0:
                return;
            default:
                if (count < 0) {
                    /* nothing sensible to do; avoid looping forever */
                    return;
                }
                break;
        }

        /* Eight elements per pass, in ascending order. */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                data_out[k] = ((data0[k]) + (data_out[k]));
            }

            data0 += 8;
            data_out += 8;
        }
        /* Fewer than eight elements are left: finish them in the switch. */
    }
}

#elif 1000 == 2 && !0

/*
 * Multiply-add helper: for each of the `count` elements,
 *
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * `data` and `data_out` are npy_byte arrays; `scalar` is a single npy_byte.
 * Nothing is returned; the result is written through `data_out`.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
{
/*
 * The condition below is the literal 0, so the NPYV branch is never compiled
 * in this expansion; only the scalar #else branch is live.
 * NOTE(review): the npyv_* names used in the disabled branch are declared
 * elsewhere (presumably simd/simd.h); their exact semantics are not visible
 * here.
 */
#if 0 // NPYV check for npy_byte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;
    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        /* Four vectors per pass: load everything, multiply-add, then store. */
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
            
#line 312
            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
            
#line 312
            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
            
#line 312
            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
            
            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
            
#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
            
#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
            
#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        /* Same loop as above, using npyv_load_s8 / npyv_store_s8 instead. */
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
            
#line 312
            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
            
#line 312
            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
            
#line 312
            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
            
            #line 318
            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
            
#line 318
            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
            
#line 318
            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
            
#line 318
            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Remaining elements, one vector step at a time, via the *_till* variants. */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data, count);
        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled by four.  Note the ordering: all four input and
     * output values are read first, then the four results are computed, and
     * only then are the four stores issued.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_byte b0 = (data[0]);
        const npy_byte c0 = (data_out[0]);
        
#line 340
        const npy_byte b1 = (data[1]);
        const npy_byte c1 = (data_out[1]);
        
#line 340
        const npy_byte b2 = (data[2]);
        const npy_byte c2 = (data_out[2]);
        
#line 340
        const npy_byte b3 = (data[3]);
        const npy_byte c3 = (data_out[3]);
        
        #line 346
        const npy_byte abc0 = scalar * b0 + c0;
        
#line 346
        const npy_byte abc1 = scalar * b1 + c1;
        
#line 346
        const npy_byte abc2 = scalar * b2 + c2;
        
#line 346
        const npy_byte abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Leftover elements (or all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_byte b = (*data);
        const npy_byte c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_byte
}

/*
 * Two-operand sum-of-products kernel over npy_byte arrays indexed as plain
 * contiguous buffers:
 *
 *     data_out[i] = data0[i] * data1[i] + data_out[i]    for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are the two inputs and dataptr[2] is the output.
 * `strides` is marked unused and `nop` is not read by the body.
 */
static void
byte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data_out = (npy_byte *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    /* The vector path below is compiled out here: its guard is the literal 0. */
    // NPYV check for npy_byte
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s8;

    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_storea_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
            
#line 390
            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
            
#line 390
            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
            
#line 390
            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
            
            #line 397
            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
            
#line 397
            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
            
#line 397
            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
            
#line 397
            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
            
            #line 402
            npyv_store_s8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s8(data_out + vstep * 3, abc3);
            
        }
    }
    
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data0, count);
        npyv_s8 b = npyv_load_tillz_s8(data1, count);
        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path unrolled by four. Every operand of the group is loaded
     * before any of the four results is stored.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_byte a0 = (data0[0]);
        const npy_byte b0 = (data1[0]);
        const npy_byte c0 = (data_out[0]);
        
#line 420
        const npy_byte a1 = (data0[1]);
        const npy_byte b1 = (data1[1]);
        const npy_byte c1 = (data_out[1]);
        
#line 420
        const npy_byte a2 = (data0[2]);
        const npy_byte b2 = (data1[2]);
        const npy_byte c2 = (data_out[2]);
        
#line 420
        const npy_byte a3 = (data0[3]);
        const npy_byte b3 = (data1[3]);
        const npy_byte c3 = (data_out[3]);
        
        #line 427
        const npy_byte abc0 = a0 * b0 + c0;
        
#line 427
        const npy_byte abc1 = a1 * b1 + c1;
        
#line 427
        const npy_byte abc2 = a2 * b2 + c2;
        
#line 427
        const npy_byte abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements one at a time; this loop handles the whole input
     * when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_byte a = (*data0);
        const npy_byte b = (*data1);
        const npy_byte c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_byte

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand specialization in which the first operand is read once as a
 * single npy_byte value. That value, the second operand's buffer and the
 * output buffer are handed to byte_sum_of_products_muladd() together with
 * `count`. `strides` is unused and `nop` is not read.
 */
static void
byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *const src = (npy_byte *)dataptr[1];
    npy_byte *const dst = (npy_byte *)dataptr[2];
    const npy_byte scalar = *(npy_byte *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* Argument order of the helper: input array, output array, scalar, length. */
    byte_sum_of_products_muladd(src, dst, scalar, count);
}

/*
 * Two-operand specialization in which the second operand is read once as a
 * single npy_byte value. That value, the first operand's buffer and the
 * output buffer are handed to byte_sum_of_products_muladd() together with
 * `count`. `strides` is unused and `nop` is not read.
 */
static void
byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *const src = (npy_byte *)dataptr[0];
    npy_byte *const dst = (npy_byte *)dataptr[2];
    const npy_byte scalar = *(npy_byte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* Argument order of the helper: input array, output array, scalar, length. */
    byte_sum_of_products_muladd(src, dst, scalar, count);
}

/*
 * Two-operand reduction into a single output element:
 *
 *     *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]
 *
 * dataptr[0] and dataptr[1] are indexed as contiguous npy_byte arrays.
 * The running sum is kept in an npy_byte (`accum`) and the output is written
 * exactly once, after all elements have been consumed. `strides` is unused
 * and `nop` is not read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte accum = 0;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
    /* The vector path below is compiled out here: its guard is the literal 0. */
#if 0 // NPYV check for npy_byte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s8;
    npyv_s8 vaccum = npyv_zero_s8();

    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
            
#line 501
            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
            
#line 501
            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
            
#line 501
            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
            
            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, vaccum);
            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
                    vaccum = npyv_muladd_s8(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
            
#line 501
            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
            
#line 501
            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
            
#line 501
            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
            
            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, vaccum);
            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
                    vaccum = npyv_muladd_s8(a0, b0, ab1);
        }
    }
    
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s8 a = npyv_load_tillz_s8(data0, count);
        npyv_s8 b = npyv_load_tillz_s8(data1, count);
        vaccum = npyv_muladd_s8(a, b, vaccum);
    }
    accum = npyv_sum_s8(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path unrolled by four: each product is narrowed to npy_byte
     * before the four of them are added into the running sum.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_byte ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_byte ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_byte ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_byte ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements one at a time; this loop handles the whole input
     * when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_byte a = (*data0);
        const npy_byte b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_byte
    /* Fold the reduction into the single output element. */
    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction into a single output element where the first operand
 * is read once as a single npy_byte value:
 *
 *     *dataptr[2] += value0 * byte_sum_of_arr(data1, count)
 *
 * The scalar factor is applied once to the reduced array instead of once per
 * element. `strides` is unused and `nop` is not read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte value0 = (*(npy_byte *)dataptr[0]);
    npy_byte accum;

    /* Emit the same debug trace line as the sibling specializations. */
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    accum = byte_sum_of_arr(data1, count);
    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value0 * accum);
}

/*
 * Two-operand reduction into a single output element where the second
 * operand is read once as a single npy_byte value:
 *
 *     *dataptr[2] += value1 * byte_sum_of_arr(data0, count)
 *
 * The scalar factor is applied once to the reduced array instead of once per
 * element. `strides` is unused and `nop` is not read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte value1 = (*(npy_byte *)dataptr[1]);
    npy_byte accum;

    /* Emit the same debug trace line as the sibling specializations. */
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    accum = byte_sum_of_arr(data0, count);
    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value1 * accum);
}

#elif 1000 == 3 && !0

/*
 * Three-operand sum-of-products kernel over npy_byte arrays indexed as plain
 * contiguous buffers:
 *
 *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
 *
 * for i in [0, count). dataptr[0..2] are the inputs and dataptr[3] is the
 * output. `strides` is unused and `nop` is not read.
 *
 * Elements are processed strictly in increasing index order, one store per
 * element. A count that is zero or negative performs no memory access.
 */
static void
byte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_byte *data0 = (npy_byte *)dataptr[0];
    npy_byte *data1 = (npy_byte *)dataptr[1];
    npy_byte *data2 = (npy_byte *)dataptr[2];
    npy_byte *data_out = (npy_byte *)dataptr[3];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    /* Full groups of eight; the fixed trip count keeps the inner loop unrollable. */
    for (; count >= 8; count -= 8,
                       data0 += 8, data1 += 8, data2 += 8, data_out += 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) *
                           (data1[i]) *
                           (data2[i]) +
                           (data_out[i]));
        }
    }

    /*
     * Finish off the loop: at most seven elements remain. Bounding the tail
     * by `count` (instead of a chain of `count-- == 0` early returns) means
     * a negative count can no longer fall through and touch eight elements.
     */
    for (i = 0; i < count; ++i) {
        data_out[i] = ((data0[i]) *
                       (data1[i]) *
                       (data2[i]) +
                       (data_out[i]));
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Generic sum-of-products kernel for an arbitrary operand count `nop`.
 *
 * For each of `count` steps the current npy_byte element of operands
 * 0..nop-1 is multiplied together and added to the current element of the
 * output operand dataptr[nop]. Every pointer in dataptr[0..nop] is then
 * advanced in place by sizeof(npy_byte), so the caller's pointer array is
 * modified. `strides` is unused.
 *
 * The `#else` branch is the interleaved (re, im) variant of the same
 * template; it is compiled out because the guard is `!0`.
 */
static void
byte_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op = 1;
        npy_byte product = *(npy_byte *)dataptr[0];

        /* Multiply in the remaining input operands. */
        while (op < nop) {
            product *= *(npy_byte *)dataptr[op];
            ++op;
        }

        /* The addend is read through the index the loop above stopped at. */
        *(npy_byte *)dataptr[nop] = product + *(npy_byte *)dataptr[op];

        /* Step every operand, output included, to its next element. */
        op = 0;
        while (op <= nop) {
            dataptr[op] += sizeof(npy_byte);
            ++op;
        }
#else /* complex */
#  if 1000 <= 3
#    define _SUMPROD_NOP 1000
#  else
#    define _SUMPROD_NOP nop
#  endif
        int op;
        npy_byte *out;
        npy_byte re = ((npy_byte *)dataptr[0])[0];
        npy_byte im = ((npy_byte *)dataptr[0])[1];

        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_byte *factor = (npy_byte *)dataptr[op];
            const npy_byte next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }

        out = (npy_byte *)dataptr[_SUMPROD_NOP];
        out[0] = re + out[0];
        out[1] = im + out[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += sizeof(npy_byte);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand reduction into a single output element:
 *
 *     *dataptr[1] += byte_sum_of_arr(dataptr[0], count)
 *
 * The `#else` branch is the interleaved (re, im) variant of the same
 * template; it is compiled out because the guard is `!0`. Neither `nop` nor
 * `strides` is read.
 */
static NPY_GCC_OPT_3 void
byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_byte *out = (npy_byte *)dataptr[1];
    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);

    *out = total + *out;
#else
    npy_byte *pair = (npy_byte *)dataptr[0];
    npy_byte *out = (npy_byte *)dataptr[1];
    npy_byte sum_re = 0;
    npy_byte sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per iteration. */
    while (count > 4) {
        const npy_byte re01 = pair[0] + pair[2];
        const npy_byte re23 = pair[4] + pair[6];
        const npy_byte im13 = pair[1] + pair[3];
        const npy_byte im57 = pair[5] + pair[7];

        sum_re += re01 + re23;
        sum_im += im13 + im57;
        count -= 4;
        pair += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        sum_re += pair[0];
        sum_im += pair[1];
        --count;
        pair += 2;
    }
    out[0] += sum_re;
    out[1] += sum_im;
#endif // !0
}

#endif /* 1000 == 1 */

/*
 * Strided sum-of-products reduction into a single output element for an
 * arbitrary operand count `nop`.
 *
 * In the branch selected by the template constants here (operand count 1000,
 * guard `!0`), each of `count` steps multiplies the current npy_byte element
 * of operands 0..nop-1, adds the product to a running npy_byte sum and
 * advances dataptr[i] by strides[i] for i in [0, nop) -- the caller's
 * pointer array is modified. The output pointer dataptr[nop] is not advanced
 * and is updated once at the end.
 *
 * The remaining preprocessor branches belong to other instantiations of the
 * same template (fixed operand counts 1..3 and the interleaved (re, im)
 * variant).
 */
static void
byte_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if 0
    npy_byte sum_re = 0, sum_im = 0;
#else
    npy_byte total = 0;
#endif

#if (1000 == 1) || (1000 <= 3 && !0)
    npy_intp step0 = strides[0];
    char *src0 = dataptr[0];
#endif
#if (1000 == 2 || 1000 == 3) && !0
    npy_intp step1 = strides[1];
    char *src1 = dataptr[1];
#endif
#if (1000 == 3) && !0
    npy_intp step2 = strides[2];
    char *src2 = dataptr[2];
#endif

    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1000 == 1
        total += *(npy_byte *)src0;
        src0 += step0;
#  elif 1000 == 2
        total += *(npy_byte *)src0 * *(npy_byte *)src1;
        src0 += step0;
        src1 += step1;
#  elif 1000 == 3
        total += *(npy_byte *)src0 * *(npy_byte *)src1 * *(npy_byte *)src2;
        src0 += step0;
        src1 += step1;
        src2 += step2;
#  else
        int op = 1;
        npy_byte product = *(npy_byte *)dataptr[0];

        /* Multiply in the remaining input operands. */
        while (op < nop) {
            product *= *(npy_byte *)dataptr[op];
            ++op;
        }
        total += product;

        /* Step the inputs only; the output stays where it is. */
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
#  endif
#else /* complex */
#  if 1000 == 1
        sum_re += ((npy_byte *)src0)[0];
        sum_im += ((npy_byte *)src0)[1];
        src0 += step0;
#  else
#    if 1000 <= 3
#define _SUMPROD_NOP 1000
#    else
#define _SUMPROD_NOP nop
#    endif
        int op;
        npy_byte re = ((npy_byte *)dataptr[0])[0];
        npy_byte im = ((npy_byte *)dataptr[0])[1];

        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_byte *factor = (npy_byte *)dataptr[op];
            const npy_byte next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }
        sum_re += re;
        sum_im += im;
        for (op = 0; op < _SUMPROD_NOP; ++op) {
            dataptr[op] += strides[op];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

#if 0
#  if 1000 <= 3
    ((npy_byte *)dataptr[1000])[0] += sum_re;
    ((npy_byte *)dataptr[1000])[1] += sum_im;
#  else
    ((npy_byte *)dataptr[nop])[0] += sum_re;
    ((npy_byte *)dataptr[nop])[1] += sum_im;
#  endif
#else
    {
#  if 1000 <= 3
        npy_byte *out = (npy_byte *)dataptr[1000];
#  else
        npy_byte *out = (npy_byte *)dataptr[nop];
#  endif
        /* Single read-modify-write of the output element. */
        *out = total + *out;
    }
#endif

}




#line 74

#if !0
/*
 * Returns the sum of the `count` npy_short elements starting at `data`,
 * read as a contiguous array. The running sum is kept in an npy_short.
 * A count of zero (or less) returns 0 without reading `data`.
 */
static NPY_GCC_OPT_3 npy_short short_sum_of_arr(npy_short *data, npy_intp count)
{
    npy_short accum = 0;
    /* The vector path below is compiled out here: its guard is the literal 0. */
#if 0 // NPYV check for npy_short
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data);
    const int vstep = npyv_nlanes_s16;
    npyv_s16 vaccum = npyv_zero_s16();
    const npy_intp vstepx4 = vstep * 4;

    #line 91
    if(is_aligned) {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s16 a0 = npyv_loada_s16(data + vstep * 0);
            
#line 96
            npyv_s16 a1 = npyv_loada_s16(data + vstep * 1);
            
#line 96
            npyv_s16 a2 = npyv_loada_s16(data + vstep * 2);
            
#line 96
            npyv_s16 a3 = npyv_loada_s16(data + vstep * 3);
            
            npyv_s16 a01   = npyv_add_s16(a0, a1);
            npyv_s16 a23   = npyv_add_s16(a2, a3);
            npyv_s16 a0123 = npyv_add_s16(a01, a23);
                      vaccum = npyv_add_s16(a0123, vaccum);
        }
    }
    
#line 91
    else {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s16 a0 = npyv_load_s16(data + vstep * 0);
            
#line 96
            npyv_s16 a1 = npyv_load_s16(data + vstep * 1);
            
#line 96
            npyv_s16 a2 = npyv_load_s16(data + vstep * 2);
            
#line 96
            npyv_s16 a3 = npyv_load_s16(data + vstep * 3);
            
            npyv_s16 a01   = npyv_add_s16(a0, a1);
            npyv_s16 a23   = npyv_add_s16(a2, a3);
            npyv_s16 a0123 = npyv_add_s16(a01, a23);
                      vaccum = npyv_add_s16(a0123, vaccum);
        }
    }
    
    for (; count > 0; count -= vstep, data += vstep) {
        npyv_s16 a = npyv_load_tillz_s16(data, count);
        vaccum = npyv_add_s16(a, vaccum);
    }
    accum = npyv_sum_s16(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path unrolled by four, summing in pairs.
     * NOTE(review): the bound is `count > 4`, not `count >= 4`, so a final
     * group of exactly four elements is left to the element-wise loop below.
     */
    for (; count > 4; count -= 4, data += 4) {
        const npy_short a01 = (*data) + (data[1]);
        const npy_short a23 = (data[2]) + (data[3]);
        accum +=  a01 + a23;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements one at a time; this loop handles the whole input
     * when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, data++) {
        accum += (*data);
    }
#endif // NPYV check for npy_short
    return accum;
}
#endif

#line 131
/*
 * Strided one-operand kernel for npy_short.
 *
 * In the branch selected by the template constants here (operand count 1,
 * guard `!0`), each of `count` steps adds the npy_short at the input cursor
 * to the npy_short at the output cursor, then moves the cursors by
 * strides[0] and strides[1] bytes respectively. The cursors are local
 * copies of dataptr[0] and dataptr[1]; `nop` is not read in this branch.
 *
 * The remaining preprocessor branches belong to other instantiations of the
 * same template (two, three or arbitrarily many operands and the interleaved
 * (re, im) variant).
 */
static void
short_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (1 == 1) || (1 <= 3 && !0)
    npy_intp step0 = strides[0];
    char *src0 = dataptr[0];
#endif
#if (1 == 2 || 1 == 3) && !0
    npy_intp step1 = strides[1];
    char *src1 = dataptr[1];
#endif
#if (1 == 3) && !0
    npy_intp step2 = strides[2];
    char *src2 = dataptr[2];
#endif
#if (1 == 1) || (1 <= 3 && !0)
    npy_intp step_out = strides[1];
    char *dst = dataptr[1];
#endif

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1 == 1
        npy_short *out = (npy_short *)dst;

        *out = *(npy_short *)src0 + *out;
        src0 += step0;
        dst += step_out;
#  elif 1 == 2
        npy_short *out = (npy_short *)dst;

        *out = *(npy_short *)src0 * *(npy_short *)src1 + *out;
        src0 += step0;
        src1 += step1;
        dst += step_out;
#  elif 1 == 3
        npy_short *out = (npy_short *)dst;

        *out = *(npy_short *)src0 * *(npy_short *)src1 * *(npy_short *)src2 + *out;
        src0 += step0;
        src1 += step1;
        src2 += step2;
        dst += step_out;
#  else
        int op = 1;
        npy_short product = *(npy_short *)dataptr[0];

        while (op < nop) {
            product *= *(npy_short *)dataptr[op];
            ++op;
        }
        /* The addend is read through the index the loop above stopped at. */
        *(npy_short *)dataptr[nop] = product + *(npy_short *)dataptr[op];
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
#  endif
#else /* complex */
#  if 1 == 1
        npy_short *in = (npy_short *)src0;
        npy_short *out = (npy_short *)dst;

        out[0] = in[0] + out[0];
        out[1] = in[1] + out[1];
        src0 += step0;
        dst += step_out;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        int op;
        npy_short *out;
        npy_short re = ((npy_short *)dataptr[0])[0];
        npy_short im = ((npy_short *)dataptr[0])[1];

        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *factor = (npy_short *)dataptr[op];
            const npy_short next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }

        out = (npy_short *)dataptr[_SUMPROD_NOP];
        out[0] = re + out[0];
        out[1] = im + out[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += strides[op];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 1 == 1

static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data_out = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_short *)data_out + 2*6)[0] =
                                    ((npy_short *)data0 + 2*6)[0] +
                                    ((npy_short *)data_out + 2*6)[0];
            ((npy_short *)data_out + 2*6)[1] =
                                    ((npy_short *)data0 + 2*6)[1] +
                                    ((npy_short *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_short *)data_out + 2*5)[0] =
                                    ((npy_short *)data0 + 2*5)[0] +
                                    ((npy_short *)data_out + 2*5)[0];
            ((npy_short *)data_out + 2*5)[1] =
                                    ((npy_short *)data0 + 2*5)[1] +
                                    ((npy_short *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_short *)data_out + 2*4)[0] =
                                    ((npy_short *)data0 + 2*4)[0] +
                                    ((npy_short *)data_out + 2*4)[0];
            ((npy_short *)data_out + 2*4)[1] =
                                    ((npy_short *)data0 + 2*4)[1] +
                                    ((npy_short *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_short *)data_out + 2*3)[0] =
                                    ((npy_short *)data0 + 2*3)[0] +
                                    ((npy_short *)data_out + 2*3)[0];
            ((npy_short *)data_out + 2*3)[1] =
                                    ((npy_short *)data0 + 2*3)[1] +
                                    ((npy_short *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_short *)data_out + 2*2)[0] =
                                    ((npy_short *)data0 + 2*2)[0] +
                                    ((npy_short *)data_out + 2*2)[0];
            ((npy_short *)data_out + 2*2)[1] =
                                    ((npy_short *)data0 + 2*2)[1] +
                                    ((npy_short *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_short *)data_out + 2*1)[0] =
                                    ((npy_short *)data0 + 2*1)[0] +
                                    ((npy_short *)data_out + 2*1)[0];
            ((npy_short *)data_out + 2*1)[1] =
                                    ((npy_short *)data0 + 2*1)[1] +
                                    ((npy_short *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_short *)data_out + 2*0)[0] =
                                    ((npy_short *)data0 + 2*0)[0] +
                                    ((npy_short *)data_out + 2*0)[0];
            ((npy_short *)data_out + 2*0)[1] =
                                    ((npy_short *)data0 + 2*0)[1] +
                                    ((npy_short *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_short *)data_out + 2*0)[0] =
                                ((npy_short *)data0 + 2*0)[0] +
                                ((npy_short *)data_out + 2*0)[0];
        ((npy_short *)data_out + 2*0)[1] =
                                ((npy_short *)data0 + 2*0)[1] +
                                ((npy_short *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_short *)data_out + 2*1)[0] =
                                ((npy_short *)data0 + 2*1)[0] +
                                ((npy_short *)data_out + 2*1)[0];
        ((npy_short *)data_out + 2*1)[1] =
                                ((npy_short *)data0 + 2*1)[1] +
                                ((npy_short *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_short *)data_out + 2*2)[0] =
                                ((npy_short *)data0 + 2*2)[0] +
                                ((npy_short *)data_out + 2*2)[0];
        ((npy_short *)data_out + 2*2)[1] =
                                ((npy_short *)data0 + 2*2)[1] +
                                ((npy_short *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_short *)data_out + 2*3)[0] =
                                ((npy_short *)data0 + 2*3)[0] +
                                ((npy_short *)data_out + 2*3)[0];
        ((npy_short *)data_out + 2*3)[1] =
                                ((npy_short *)data0 + 2*3)[1] +
                                ((npy_short *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_short *)data_out + 2*4)[0] =
                                ((npy_short *)data0 + 2*4)[0] +
                                ((npy_short *)data_out + 2*4)[0];
        ((npy_short *)data_out + 2*4)[1] =
                                ((npy_short *)data0 + 2*4)[1] +
                                ((npy_short *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_short *)data_out + 2*5)[0] =
                                ((npy_short *)data0 + 2*5)[0] +
                                ((npy_short *)data_out + 2*5)[0];
        ((npy_short *)data_out + 2*5)[1] =
                                ((npy_short *)data0 + 2*5)[1] +
                                ((npy_short *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_short *)data_out + 2*6)[0] =
                                ((npy_short *)data0 + 2*6)[0] +
                                ((npy_short *)data_out + 2*6)[0];
        ((npy_short *)data_out + 2*6)[1] =
                                ((npy_short *)data0 + 2*6)[1] +
                                ((npy_short *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_short *)data_out + 2*7)[0] =
                                ((npy_short *)data0 + 2*7)[0] +
                                ((npy_short *)data_out + 2*7)[0];
        ((npy_short *)data_out + 2*7)[1] =
                                ((npy_short *)data0 + 2*7)[1] +
                                ((npy_short *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 1 == 2 && !0

/*
 * Scaled accumulation over two contiguous arrays of `count` elements:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * Within each chunk every input and every current output value is read
 * before the first result of that chunk is written back.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
{
#if 0 // NPYV check for npy_short
    /* Vector path; excluded from compilation by the literal 0 above */
    const int vstep = npyv_nlanes_s16;
    const npy_intp vstepx4 = vstep * 4;
    const npyv_s16 v_scalar = npyv_setall_s16(scalar);

    /* Four vectors per iteration; aligned accesses only if both buffers qualify */
    if (EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out)) {
        while (count >= vstepx4) {
            npyv_s16 src0 = npyv_loada_s16(data + vstep * 0);
            npyv_s16 src1 = npyv_loada_s16(data + vstep * 1);
            npyv_s16 src2 = npyv_loada_s16(data + vstep * 2);
            npyv_s16 src3 = npyv_loada_s16(data + vstep * 3);
            npyv_s16 acc0 = npyv_loada_s16(data_out + vstep * 0);
            npyv_s16 acc1 = npyv_loada_s16(data_out + vstep * 1);
            npyv_s16 acc2 = npyv_loada_s16(data_out + vstep * 2);
            npyv_s16 acc3 = npyv_loada_s16(data_out + vstep * 3);

            acc0 = npyv_muladd_s16(v_scalar, src0, acc0);
            acc1 = npyv_muladd_s16(v_scalar, src1, acc1);
            acc2 = npyv_muladd_s16(v_scalar, src2, acc2);
            acc3 = npyv_muladd_s16(v_scalar, src3, acc3);

            npyv_storea_s16(data_out + vstep * 0, acc0);
            npyv_storea_s16(data_out + vstep * 1, acc1);
            npyv_storea_s16(data_out + vstep * 2, acc2);
            npyv_storea_s16(data_out + vstep * 3, acc3);

            count -= vstepx4;
            data += vstepx4;
            data_out += vstepx4;
        }
    }
    else {
        while (count >= vstepx4) {
            npyv_s16 src0 = npyv_load_s16(data + vstep * 0);
            npyv_s16 src1 = npyv_load_s16(data + vstep * 1);
            npyv_s16 src2 = npyv_load_s16(data + vstep * 2);
            npyv_s16 src3 = npyv_load_s16(data + vstep * 3);
            npyv_s16 acc0 = npyv_load_s16(data_out + vstep * 0);
            npyv_s16 acc1 = npyv_load_s16(data_out + vstep * 1);
            npyv_s16 acc2 = npyv_load_s16(data_out + vstep * 2);
            npyv_s16 acc3 = npyv_load_s16(data_out + vstep * 3);

            acc0 = npyv_muladd_s16(v_scalar, src0, acc0);
            acc1 = npyv_muladd_s16(v_scalar, src1, acc1);
            acc2 = npyv_muladd_s16(v_scalar, src2, acc2);
            acc3 = npyv_muladd_s16(v_scalar, src3, acc3);

            npyv_store_s16(data_out + vstep * 0, acc0);
            npyv_store_s16(data_out + vstep * 1, acc1);
            npyv_store_s16(data_out + vstep * 2, acc2);
            npyv_store_s16(data_out + vstep * 3, acc3);

            count -= vstepx4;
            data += vstepx4;
            data_out += vstepx4;
        }
    }

    /* Remaining elements, one partially filled vector at a time */
    while (count > 0) {
        npyv_s16 src = npyv_load_tillz_s16(data, count);
        npyv_s16 acc = npyv_load_tillz_s16(data_out, count);

        npyv_store_till_s16(data_out, count, npyv_muladd_s16(src, v_scalar, acc));
        count -= vstep;
        data += vstep;
        data_out += vstep;
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four elements per iteration: compute all four results, then store them */
    while (count >= 4) {
        const npy_short r0 = scalar * data[0] + data_out[0];
        const npy_short r1 = scalar * data[1] + data_out[1];
        const npy_short r2 = scalar * data[2] + data_out[2];
        const npy_short r3 = scalar * data[3] + data_out[3];

        data_out[0] = r0;
        data_out[1] = r1;
        data_out[2] = r2;
        data_out[3] = r3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one element at a time */
    while (count > 0) {
        *data_out = (scalar * (*data) + (*data_out));
        --count;
        ++data;
        ++data_out;
    }
#endif // NPYV check for npy_short
}

/*
 * Element-wise multiply-accumulate of two contiguous operands into a
 * contiguous output:
 *
 *     out[i] = lhs[i] * rhs[i] + out[i]        for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] supply the operands, dataptr[2] the output.
 * Within each chunk all values are read before any result is stored.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_short
#if 0
    /* Vector path; excluded from compilation by the literal 0 above */
    const int vstep = npyv_nlanes_s16;
    const npy_intp vstepx4 = vstep * 4;

    /* Four vectors per iteration; aligned accesses only if all buffers qualify */
    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs) &&
            EINSUM_IS_ALIGNED(out)) {
        while (count >= vstepx4) {
            npyv_s16 l0 = npyv_loada_s16(lhs + vstep * 0);
            npyv_s16 l1 = npyv_loada_s16(lhs + vstep * 1);
            npyv_s16 l2 = npyv_loada_s16(lhs + vstep * 2);
            npyv_s16 l3 = npyv_loada_s16(lhs + vstep * 3);
            npyv_s16 r0 = npyv_loada_s16(rhs + vstep * 0);
            npyv_s16 r1 = npyv_loada_s16(rhs + vstep * 1);
            npyv_s16 r2 = npyv_loada_s16(rhs + vstep * 2);
            npyv_s16 r3 = npyv_loada_s16(rhs + vstep * 3);
            npyv_s16 acc0 = npyv_loada_s16(out + vstep * 0);
            npyv_s16 acc1 = npyv_loada_s16(out + vstep * 1);
            npyv_s16 acc2 = npyv_loada_s16(out + vstep * 2);
            npyv_s16 acc3 = npyv_loada_s16(out + vstep * 3);

            acc0 = npyv_muladd_s16(l0, r0, acc0);
            acc1 = npyv_muladd_s16(l1, r1, acc1);
            acc2 = npyv_muladd_s16(l2, r2, acc2);
            acc3 = npyv_muladd_s16(l3, r3, acc3);

            npyv_storea_s16(out + vstep * 0, acc0);
            npyv_storea_s16(out + vstep * 1, acc1);
            npyv_storea_s16(out + vstep * 2, acc2);
            npyv_storea_s16(out + vstep * 3, acc3);

            count -= vstepx4;
            lhs += vstepx4;
            rhs += vstepx4;
            out += vstepx4;
        }
    }
    else {
        while (count >= vstepx4) {
            npyv_s16 l0 = npyv_load_s16(lhs + vstep * 0);
            npyv_s16 l1 = npyv_load_s16(lhs + vstep * 1);
            npyv_s16 l2 = npyv_load_s16(lhs + vstep * 2);
            npyv_s16 l3 = npyv_load_s16(lhs + vstep * 3);
            npyv_s16 r0 = npyv_load_s16(rhs + vstep * 0);
            npyv_s16 r1 = npyv_load_s16(rhs + vstep * 1);
            npyv_s16 r2 = npyv_load_s16(rhs + vstep * 2);
            npyv_s16 r3 = npyv_load_s16(rhs + vstep * 3);
            npyv_s16 acc0 = npyv_load_s16(out + vstep * 0);
            npyv_s16 acc1 = npyv_load_s16(out + vstep * 1);
            npyv_s16 acc2 = npyv_load_s16(out + vstep * 2);
            npyv_s16 acc3 = npyv_load_s16(out + vstep * 3);

            acc0 = npyv_muladd_s16(l0, r0, acc0);
            acc1 = npyv_muladd_s16(l1, r1, acc1);
            acc2 = npyv_muladd_s16(l2, r2, acc2);
            acc3 = npyv_muladd_s16(l3, r3, acc3);

            npyv_store_s16(out + vstep * 0, acc0);
            npyv_store_s16(out + vstep * 1, acc1);
            npyv_store_s16(out + vstep * 2, acc2);
            npyv_store_s16(out + vstep * 3, acc3);

            count -= vstepx4;
            lhs += vstepx4;
            rhs += vstepx4;
            out += vstepx4;
        }
    }

    /* Remaining elements, one partially filled vector at a time */
    while (count > 0) {
        npyv_s16 l = npyv_load_tillz_s16(lhs, count);
        npyv_s16 r = npyv_load_tillz_s16(rhs, count);
        npyv_s16 acc = npyv_load_tillz_s16(out, count);

        npyv_store_till_s16(out, count, npyv_muladd_s16(l, r, acc));
        count -= vstep;
        lhs += vstep;
        rhs += vstep;
        out += vstep;
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four elements per iteration: compute all four results, then store them */
    while (count >= 4) {
        const npy_short res0 = lhs[0] * rhs[0] + out[0];
        const npy_short res1 = lhs[1] * rhs[1] + out[1];
        const npy_short res2 = lhs[2] * rhs[2] + out[2];
        const npy_short res3 = lhs[3] * rhs[3] + out[3];

        out[0] = res0;
        out[1] = res1;
        out[2] = res2;
        out[3] = res3;

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one element at a time */
    while (count > 0) {
        *out = ((*lhs) * (*rhs) + (*out));
        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
#endif // NPYV check for npy_short

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case in which dataptr[0] is read once as a scalar factor:
 * forwards to short_sum_of_products_muladd with the array at dataptr[1],
 * the output at dataptr[2], that factor, and `count`.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = *(npy_short *)dataptr[0];
    npy_short *const vec = (npy_short *)dataptr[1];
    npy_short *const out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    short_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand case in which dataptr[1] is read once as a scalar factor:
 * forwards to short_sum_of_products_muladd with the array at dataptr[0],
 * the output at dataptr[2], that factor, and `count`.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = *(npy_short *)dataptr[1];
    npy_short *const vec = (npy_short *)dataptr[0];
    npy_short *const out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    short_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Dot-product style reduction of two contiguous operands into a single
 * output element:
 *
 *     *out = *out + sum(lhs[i] * rhs[i])        for 0 <= i < count
 *
 * dataptr[0] and dataptr[1] supply the operands, dataptr[2] the output
 * element.  The sum is built in a local npy_short accumulator and added to
 * the output once, after the loops.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_short
    /* Vector path; excluded from compilation by the literal 0 above */
    const int vstep = npyv_nlanes_s16;
    const npy_intp vstepx4 = vstep * 4;
    npyv_s16 vtotal = npyv_zero_s16();

    /* Four vectors per iteration; aligned loads only if both inputs qualify */
    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs)) {
        while (count >= vstepx4) {
            npyv_s16 l0 = npyv_loada_s16(lhs + vstep * 0);
            npyv_s16 l1 = npyv_loada_s16(lhs + vstep * 1);
            npyv_s16 l2 = npyv_loada_s16(lhs + vstep * 2);
            npyv_s16 l3 = npyv_loada_s16(lhs + vstep * 3);
            npyv_s16 r0 = npyv_loada_s16(rhs + vstep * 0);
            npyv_s16 r1 = npyv_loada_s16(rhs + vstep * 1);
            npyv_s16 r2 = npyv_loada_s16(rhs + vstep * 2);
            npyv_s16 r3 = npyv_loada_s16(rhs + vstep * 3);

            /* Chain the four multiply-adds, highest lane group first */
            vtotal = npyv_muladd_s16(l3, r3, vtotal);
            vtotal = npyv_muladd_s16(l2, r2, vtotal);
            vtotal = npyv_muladd_s16(l1, r1, vtotal);
            vtotal = npyv_muladd_s16(l0, r0, vtotal);

            count -= vstepx4;
            lhs += vstepx4;
            rhs += vstepx4;
        }
    }
    else {
        while (count >= vstepx4) {
            npyv_s16 l0 = npyv_load_s16(lhs + vstep * 0);
            npyv_s16 l1 = npyv_load_s16(lhs + vstep * 1);
            npyv_s16 l2 = npyv_load_s16(lhs + vstep * 2);
            npyv_s16 l3 = npyv_load_s16(lhs + vstep * 3);
            npyv_s16 r0 = npyv_load_s16(rhs + vstep * 0);
            npyv_s16 r1 = npyv_load_s16(rhs + vstep * 1);
            npyv_s16 r2 = npyv_load_s16(rhs + vstep * 2);
            npyv_s16 r3 = npyv_load_s16(rhs + vstep * 3);

            /* Chain the four multiply-adds, highest lane group first */
            vtotal = npyv_muladd_s16(l3, r3, vtotal);
            vtotal = npyv_muladd_s16(l2, r2, vtotal);
            vtotal = npyv_muladd_s16(l1, r1, vtotal);
            vtotal = npyv_muladd_s16(l0, r0, vtotal);

            count -= vstepx4;
            lhs += vstepx4;
            rhs += vstepx4;
        }
    }

    /* Remaining elements, one partially filled vector at a time */
    while (count > 0) {
        npyv_s16 l = npyv_load_tillz_s16(lhs, count);
        npyv_s16 r = npyv_load_tillz_s16(rhs, count);

        vtotal = npyv_muladd_s16(l, r, vtotal);
        count -= vstep;
        lhs += vstep;
        rhs += vstep;
    }
    total = npyv_sum_s16(vtotal);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four elements per iteration; each product is narrowed to npy_short first */
    while (count >= 4) {
        const npy_short p0 = lhs[0] * rhs[0];
        const npy_short p1 = lhs[1] * rhs[1];
        const npy_short p2 = lhs[2] * rhs[2];
        const npy_short p3 = lhs[3] * rhs[3];

        total += p0 + p1 + p2 + p3;
        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one element at a time */
    while (count > 0) {
        total += (*lhs) * (*rhs);
        --count;
        ++lhs;
        ++rhs;
    }
#endif // NPYV check for npy_short
    {
        npy_short *const out = (npy_short *)dataptr[2];

        *out = ((*out) + total);
    }
}

/*
 * Two-operand reduction in which dataptr[0] is read once as a scalar:
 * the array at dataptr[1] is reduced with short_sum_of_arr (defined elsewhere
 * in this file), the result is multiplied by that scalar, and the product is
 * added to the single output element at dataptr[2].
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = *(npy_short *)dataptr[0];
    npy_short *const out = (npy_short *)dataptr[2];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[1], count);

    *out = ((*out) + factor * total);
}

/*
 * Two-operand reduction in which dataptr[1] is read once as a scalar:
 * the array at dataptr[0] is reduced with short_sum_of_arr (defined elsewhere
 * in this file), the result is multiplied by that scalar, and the product is
 * added to the single output element at dataptr[2].
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = *(npy_short *)dataptr[1];
    npy_short *const out = (npy_short *)dataptr[2];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);

    *out = ((*out) + factor * total);
}

#elif 1 == 3 && !0

/*
 * Element-wise multiply-accumulate of three contiguous operands into a
 * contiguous output:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]        for 0 <= i < count
 *
 * dataptr[0..2] are the operands and dataptr[3] is the output; `nop` and the
 * strides are not consulted.
 *
 * The arithmetic is carried out in unsigned int and only the final value is
 * converted back to npy_short.  With the operands promoted to signed int
 * (assuming npy_short is narrower than int, as its name suggests), a product
 * of three values plus the addend can exceed INT_MAX, which is undefined
 * behaviour.  Unsigned arithmetic wraps instead and produces the same
 * low-order bits, so the stored value is unchanged wherever the signed
 * computation was well defined.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *in0 = (npy_short *)dataptr[0];
    npy_short *in1 = (npy_short *)dataptr[1];
    npy_short *in2 = (npy_short *)dataptr[2];
    npy_short *out = (npy_short *)dataptr[3];
    int k;

    /* Full groups of eight elements */
    while (count >= 8) {
        count -= 8;

        for (k = 0; k < 8; ++k) {
            out[k] = (npy_short)((unsigned int)in0[k] *
                                 (unsigned int)in1[k] *
                                 (unsigned int)in2[k] +
                                 (unsigned int)out[k]);
        }

        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /* Finish off the loop: stop as soon as the remaining count runs out */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = (npy_short)((unsigned int)in0[k] *
                             (unsigned int)in1[k] *
                             (unsigned int)in2[k] +
                             (unsigned int)out[k]);
    }

}

#else /* 1 > 3 || complex */

/*
 * Generic contiguous kernel driven by `nop`: for each of `count` steps the
 * current elements of dataptr[0..nop-1] are multiplied together, the product
 * is added into the element at dataptr[nop], and every pointer in
 * dataptr[0..nop] is advanced by sizeof(npy_short) bytes.
 *
 * Note that the pointers stored in `dataptr` itself are modified.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op = 1;
        npy_short prod = *(npy_short *)dataptr[0];

        while (op < nop) {
            prod *= *(npy_short *)dataptr[op];
            ++op;
        }
        /* `op` still holds the index at which the product loop stopped */
        *(npy_short *)dataptr[nop] = (prod + (*(npy_short *)dataptr[op]));
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
#else /* complex */
#  if 1 <= 3
#    define _SUMPROD_NOP 1
#  else
#    define _SUMPROD_NOP nop
#  endif
        int op;
        npy_short *const first = (npy_short *)dataptr[0];
        npy_short *const out = (npy_short *)dataptr[_SUMPROD_NOP];
        npy_short re = first[0];
        npy_short im = first[1];

        /* Complex multiplication on (re, im) pairs */
        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *const factor = (npy_short *)dataptr[op];
            const npy_short next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }
        out[0] = re + out[0];
        out[1] = im + out[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Reduction of one contiguous operand into a single output element.
 *
 * In the branch that is compiled, the array at dataptr[0] is reduced with
 * short_sum_of_arr (defined elsewhere in this file) and the result is added
 * to the element at dataptr[1].  The alternative branch is the template's
 * variant for (re, im) pairs.  `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_short *const out = (npy_short *)dataptr[1];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);

    *out = (total + (*out));
#else
    npy_short *pairs = (npy_short *)dataptr[0];
    npy_short sum_re = 0, sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per iteration */
    while (count > 4) {
        const npy_short re_lo = pairs[0] + pairs[2];
        const npy_short re_hi = pairs[4] + pairs[6];
        const npy_short im_lo = pairs[1] + pairs[3];
        const npy_short im_hi = pairs[5] + pairs[7];

        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
        count -= 4;
        pairs += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one pair at a time */
    while (count > 0) {
        sum_re += pairs[0];
        sum_im += pairs[1];
        --count;
        pairs += 2;
    }
    ((npy_short *)dataptr[1])[0] += sum_re;
    ((npy_short *)dataptr[1])[1] += sum_im;
#endif // !0
}

#endif /* 1 == 1 */

/*
 * Strided reduction of a single operand into one output element.
 *
 * In the branches that are compiled here, each of the `count` steps adds the
 * element at the current input position to a local accumulator and advances
 * the input by strides[0] bytes; afterwards the accumulator is added to the
 * element at dataptr[1].  The other preprocessor branches are the template's
 * alternatives for different operand counts and for (re, im) pairs.
 */
static void
short_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* The output slot is never advanced by the loop below */
#if 1 <= 3
    npy_short *const out = (npy_short *)dataptr[1];
#else
    npy_short *const out = (npy_short *)dataptr[nop];
#endif
#if 0
    npy_short total_re = 0, total_im = 0;
#else
    npy_short total = 0;
#endif

#if (1 == 1) || (1 <= 3 && !0)
    char *src0 = dataptr[0];
    npy_intp step0 = strides[0];
#endif
#if (1 == 2 || 1 == 3) && !0
    char *src1 = dataptr[1];
    npy_intp step1 = strides[1];
#endif
#if (1 == 3) && !0
    char *src2 = dataptr[2];
    npy_intp step2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1 == 1
        total += *(npy_short *)src0;
        src0 += step0;
#  elif 1 == 2
        total += (*(npy_short *)src0) *
                 (*(npy_short *)src1);
        src0 += step0;
        src1 += step1;
#  elif 1 == 3
        total += (*(npy_short *)src0) *
                 (*(npy_short *)src1) *
                 (*(npy_short *)src2);
        src0 += step0;
        src1 += step1;
        src2 += step2;
#  else
        int op;
        npy_short prod = *(npy_short *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            prod *= *(npy_short *)dataptr[op];
        }
        total += prod;
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
#  endif
#else /* complex */
#  if 1 == 1
        total_re += ((npy_short *)src0)[0];
        total_im += ((npy_short *)src0)[1];
        src0 += step0;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        int op;
        npy_short re = ((npy_short *)dataptr[0])[0];
        npy_short im = ((npy_short *)dataptr[0])[1];

        /* Complex multiplication on (re, im) pairs */
        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *const factor = (npy_short *)dataptr[op];
            const npy_short next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }
        total_re += re;
        total_im += im;
        for (op = 0; op < _SUMPROD_NOP; ++op) {
            dataptr[op] += strides[op];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulator into the output element */
#if 0
    out[0] += total_re;
    out[1] += total_im;
#else
    *out = (total + (*out));
#endif

}


#line 131
/*
 * Fully strided two-operand kernel.
 *
 * In the branches that are compiled here, each of the `count` steps computes
 *
 *     *out = (*in0) * (*in1) + *out
 *
 * on npy_short elements and then advances the two inputs and the output by
 * strides[0], strides[1] and strides[2] bytes respectively.  The other
 * preprocessor branches are the template's alternatives for different
 * operand counts and for (re, im) pairs.
 */
static void
short_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (2 == 1) || (2 <= 3 && !0)
    char *in0 = dataptr[0];
    npy_intp step0 = strides[0];
#endif
#if (2 == 2 || 2 == 3) && !0
    char *in1 = dataptr[1];
    npy_intp step1 = strides[1];
#endif
#if (2 == 3) && !0
    char *in2 = dataptr[2];
    npy_intp step2 = strides[2];
#endif
#if (2 == 1) || (2 <= 3 && !0)
    char *dst = dataptr[2];
    npy_intp dst_step = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
#if !0
#  if 2 == 1
        npy_short *const cell = (npy_short *)dst;

        *cell = ((*(npy_short *)in0) + (*cell));
        in0 += step0;
        dst += dst_step;
#  elif 2 == 2
        npy_short *const cell = (npy_short *)dst;

        *cell = ((*(npy_short *)in0) * (*(npy_short *)in1) + (*cell));
        in0 += step0;
        in1 += step1;
        dst += dst_step;
#  elif 2 == 3
        npy_short *const cell = (npy_short *)dst;

        *cell = ((*(npy_short *)in0) *
                 (*(npy_short *)in1) *
                 (*(npy_short *)in2) + (*cell));
        in0 += step0;
        in1 += step1;
        in2 += step2;
        dst += dst_step;
#  else
        int op;
        npy_short prod = *(npy_short *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            prod *= *(npy_short *)dataptr[op];
        }
        /* `op` still holds the index at which the product loop stopped */
        *(npy_short *)dataptr[nop] = (prod + (*(npy_short *)dataptr[op]));
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
#  endif
#else /* complex */
#  if 2 == 1
        npy_short *const cell = (npy_short *)dst;
        const npy_short *const src = (npy_short *)in0;

        cell[0] = src[0] + cell[0];
        cell[1] = src[1] + cell[1];
        in0 += step0;
        dst += dst_step;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        int op;
        npy_short *const cell = (npy_short *)dataptr[_SUMPROD_NOP];
        npy_short re = ((npy_short *)dataptr[0])[0];
        npy_short im = ((npy_short *)dataptr[0])[1];

        /* Complex multiplication on (re, im) pairs */
        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *const factor = (npy_short *)dataptr[op];
            const npy_short next_re = re * factor[0] - im * factor[1];

            im = re * factor[1] + im * factor[0];
            re = next_re;
        }
        cell[0] = re + cell[0];
        cell[1] = im + cell[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += strides[op];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 2 == 1

/*
 * Single-operand kernel over densely indexed npy_short data: every one of
 * the `count` elements of dataptr[0] is added into the element with the
 * same index in dataptr[1].  `nop` is not consulted and the strides
 * argument is ignored.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The remainder (0..7 elements) is dispatched before the unrolled
         * loop so that small counts finish without entering it.  Each
         * case deliberately falls into the next one, walking the indices
         * downwards until the function returns at case 0.
         */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Main body: eight elements per iteration, lowest index first. */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish whatever is left. */
    }
}

#elif 2 == 2 && !0

/*
 * In-place multiply-accumulate over `count` npy_short elements:
 *     data_out[i] = scalar * data[i] + data_out[i]
 * Only the plain C path exists for this type; when
 * NPY_DISABLE_OPTIMIZATION is not defined the bulk is handled four
 * elements at a time and the element-wise loop mops up the rest.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Per block of four: read everything, compute, then write back. */
    while (count >= 4) {
        const npy_short in0 = (data[0]), acc0 = (data_out[0]);
        const npy_short in1 = (data[1]), acc1 = (data_out[1]);
        const npy_short in2 = (data[2]), acc2 = (data_out[2]);
        const npy_short in3 = (data[3]), acc3 = (data_out[3]);

        const npy_short res0 = scalar * in0 + acc0;
        const npy_short res1 = scalar * in1 + acc1;
        const npy_short res2 = scalar * in2 + acc2;
        const npy_short res3 = scalar * in3 + acc3;

        data_out[0] = (res0);
        data_out[1] = (res1);
        data_out[2] = (res2);
        data_out[3] = (res3);

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (or all of them when the block loop is disabled). */
    while (count > 0) {
        const npy_short in = (*data);
        const npy_short acc = (*data_out);

        *data_out = (scalar * in + acc);

        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand kernel over densely indexed npy_short data:
 *     out[i] = in0[i] * in1[i] + out[i]   for i in [0, count)
 * with in0 = dataptr[0], in1 = dataptr[1], out = dataptr[2].
 * `nop` is not consulted and the strides argument is ignored.  Only the
 * plain C path exists for this type.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Per block of four: read everything, compute, then write back. */
    while (count >= 4) {
        const npy_short x0 = (lhs[0]), y0 = (rhs[0]), z0 = (out[0]);
        const npy_short x1 = (lhs[1]), y1 = (rhs[1]), z1 = (out[1]);
        const npy_short x2 = (lhs[2]), y2 = (rhs[2]), z2 = (out[2]);
        const npy_short x3 = (lhs[3]), y3 = (rhs[3]), z3 = (out[3]);

        const npy_short r0 = x0 * y0 + z0;
        const npy_short r1 = x1 * y1 + z1;
        const npy_short r2 = x2 * y2 + z2;
        const npy_short r3 = x3 * y3 + z3;

        out[0] = (r0);
        out[1] = (r1);
        out[2] = (r2);
        out[3] = (r3);

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (or all of them when the block loop is disabled). */
    while (count > 0) {
        const npy_short x = (*lhs);
        const npy_short y = (*rhs);
        const npy_short z = (*out);

        *out = (x * y + z);

        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand specialization in which the first operand is a single
 * value: reads one npy_short from dataptr[0] and hands the arrays at
 * dataptr[1] (input) and dataptr[2] (output) to the muladd helper with
 * that value as the scalar.  `nop` and the strides are not used.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = (*(npy_short *)dataptr[0]);
    npy_short *vec = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand specialization in which the second operand is a single
 * value: reads one npy_short from dataptr[1] and hands the arrays at
 * dataptr[0] (input) and dataptr[2] (output) to the muladd helper with
 * that value as the scalar.  `nop` and the strides are not used.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = (*(npy_short *)dataptr[1]);
    npy_short *vec = (npy_short *)dataptr[0];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand reduction to a single output element: accumulates
 * in0[i] * in1[i] over `count` densely indexed npy_short elements
 * (in0 = dataptr[0], in1 = dataptr[1]) and adds the total to the one
 * npy_short at dataptr[2].  `nop` and the strides are not used.  Only the
 * plain C path exists for this type.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per iteration, each narrowed to npy_short first. */
    while (count >= 4) {
        const npy_short p0 = (lhs[0]) * (rhs[0]);
        const npy_short p1 = (lhs[1]) * (rhs[1]);
        const npy_short p2 = (lhs[2]) * (rhs[2]);
        const npy_short p3 = (lhs[3]) * (rhs[3]);

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (or all of them when the block loop is disabled). */
    while (count > 0) {
        const npy_short x = (*lhs);
        const npy_short y = (*rhs);

        total += x * y;

        --count;
        ++lhs;
        ++rhs;
    }

    *out = ((*out) + total);
}

/*
 * Two-operand reduction where the first operand is a single value:
 * sums the `count` npy_short elements at dataptr[1] via
 * short_sum_of_arr, multiplies that sum by the value read from
 * dataptr[0], and adds the result to the one npy_short at dataptr[2].
 * `nop` and the strides are not used.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *out = (npy_short *)dataptr[2];
    npy_short *vec = (npy_short *)dataptr[1];
    const npy_short factor = (*(npy_short *)dataptr[0]);
    const npy_short vec_sum = short_sum_of_arr(vec, count);

    *out = ((*out) + factor * vec_sum);
}

/*
 * Two-operand reduction where the second operand is a single value:
 * sums the `count` npy_short elements at dataptr[0] via
 * short_sum_of_arr, multiplies that sum by the value read from
 * dataptr[1], and adds the result to the one npy_short at dataptr[2].
 * `nop` and the strides are not used.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *out = (npy_short *)dataptr[2];
    npy_short *vec = (npy_short *)dataptr[0];
    const npy_short factor = (*(npy_short *)dataptr[1]);
    const npy_short vec_sum = short_sum_of_arr(vec, count);

    *out = ((*out) + factor * vec_sum);
}

#elif 2 == 3 && !0

/*
 * Three-operand kernel over densely indexed npy_short data:
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]   for i in [0, count)
 * with in0..in2 = dataptr[0..2] and out = dataptr[3].  `nop` is not
 * consulted and the strides argument is ignored.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *a = (npy_short *)dataptr[0];
    npy_short *b = (npy_short *)dataptr[1];
    npy_short *c = (npy_short *)dataptr[2];
    npy_short *out = (npy_short *)dataptr[3];
    int k;

    /* Main body: eight elements per iteration, lowest index first. */
    while (count >= 8) {
        count -= 8;

        out[0] = ((a[0]) * (b[0]) * (c[0]) + (out[0]));
        out[1] = ((a[1]) * (b[1]) * (c[1]) + (out[1]));
        out[2] = ((a[2]) * (b[2]) * (c[2]) + (out[2]));
        out[3] = ((a[3]) * (b[3]) * (c[3]) + (out[3]));
        out[4] = ((a[4]) * (b[4]) * (c[4]) + (out[4]));
        out[5] = ((a[5]) * (b[5]) * (c[5]) + (out[5]));
        out[6] = ((a[6]) * (b[6]) * (c[6]) + (out[6]));
        out[7] = ((a[7]) * (b[7]) * (c[7]) + (out[7]));

        a += 8;
        b += 8;
        c += 8;
        out += 8;
    }

    /*
     * Remainder: at most eight further slots are visited, and the
     * function returns as soon as the running count is found to be zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = ((a[k]) * (b[k]) * (c[k]) + (out[k]));
    }
}

#else /* 2 > 3 || complex */

/*
 * Generic fallback for densely packed npy_short operands with `nop`
 * inputs: on each of the `count` steps the current elements of
 * dataptr[0..nop-1] are multiplied together, the product is added into
 * the element at dataptr[nop], and every entry of dataptr[0..nop] is
 * advanced by sizeof(npy_short).  Note that the caller's dataptr array
 * itself is modified.  The strides argument is ignored.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    while (count-- != 0) {
        int op;
        npy_short prod = (*(npy_short *)dataptr[0]);

        for (op = 1; op < nop; ++op) {
            prod *= (*(npy_short *)dataptr[op]);
        }

        /*
         * `op` keeps the value at which the product loop stopped; the
         * previous output value is read through that index.
         */
        *(npy_short *)dataptr[nop] = (prod +
                                      (*(npy_short *)dataptr[op]));

        /* Step all inputs and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * Single-operand reduction to one output element: sums the `count`
 * npy_short elements at dataptr[0] via short_sum_of_arr and adds the
 * total to the one npy_short at dataptr[1].  `nop` and `strides` are not
 * used.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    npy_short *src = (npy_short *)dataptr[0];
    const npy_short total = short_sum_of_arr(src, count);
    npy_short *out = (npy_short *)dataptr[1];

    *out = (total + (*out));
}

#endif /* 2 == 1 */

/*
 * Two-operand sum of products reduced into a single output element.
 *
 * Reads `count` npy_short values from each of dataptr[0] and dataptr[1],
 * stepping the two pointers by strides[0] and strides[1] bytes per element,
 * accumulates the products in an npy_short, and finally adds that
 * accumulator to the npy_short stored at dataptr[2].  The output pointer is
 * never advanced and the dataptr[] entries themselves are left untouched.
 * `nop` is not read; the operand count is fixed at two here.
 */
static void
short_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_short total = 0;
    char *lhs = dataptr[0];
    const npy_intp lhs_step = strides[0];
    char *rhs = dataptr[1];
    const npy_intp rhs_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_short *)lhs) * (*(npy_short *)rhs);
        /* Strides are byte offsets, hence the char * arithmetic. */
        lhs += lhs_step;
        rhs += rhs_step;
    }

    *((npy_short *)dataptr[2]) = (total + (*((npy_short *)dataptr[2])));
}


#line 131
/*
 * Three-operand sum of products for npy_short:
 *
 *     out[k] = op0[k] * op1[k] * op2[k] + out[k]      for k in [0, count)
 *
 * dataptr[0..2] are the operand pointers and dataptr[3] is the output
 * pointer; after every element each one is advanced by the matching
 * strides[] entry, which is a byte offset.  The dataptr[] entries themselves
 * are not modified.  `nop` is not read; the operand count is fixed at three.
 *
 * The arithmetic is deliberately done in unsigned int.  Left as npy_short,
 * the factors would be promoted to signed int (assuming npy_short is
 * narrower than int), and the product of three of them can exceed INT_MAX,
 * which is undefined behaviour.  Unsigned arithmetic wraps instead, and the
 * low-order bits kept by the final narrowing store are the same ones a
 * wrapping signed computation would produce on implementations whose
 * narrowing conversion reduces modulo 2^N.
 */
static void
short_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *data0 = dataptr[0];
    char *data1 = dataptr[1];
    char *data2 = dataptr[2];
    char *data_out = dataptr[3];
    const npy_intp stride0 = strides[0];
    const npy_intp stride1 = strides[1];
    const npy_intp stride2 = strides[2];
    const npy_intp stride_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
        /* Widen to unsigned so the triple product cannot overflow. */
        const unsigned int a = (unsigned int)(*(npy_short *)data0);
        const unsigned int b = (unsigned int)(*(npy_short *)data1);
        const unsigned int c = (unsigned int)(*(npy_short *)data2);
        const unsigned int prev = (unsigned int)(*(npy_short *)data_out);

        *(npy_short *)data_out = (npy_short)(a * b * c + prev);

        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
    }
}

#if 3 == 1

/*
 * One-operand contiguous accumulate:
 *
 *     out[k] = in[k] + out[k]      for k in [0, count)
 *
 * with in = dataptr[0] and out = dataptr[1], both viewed as npy_short
 * arrays.  `nop` and the strides argument are not read.
 *
 * Each pass first tries to finish the job through the switch (so counts
 * below 8 never enter the unrolled loop), otherwise consumes blocks of 8 and
 * comes back to the switch for the remainder.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Remainder handling: every case falls through to the next one. */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fallthrough */
            case 6:
                dst[5] += src[5];
                /* fallthrough */
            case 5:
                dst[4] += src[4];
                /* fallthrough */
            case 4:
                dst[3] += src[3];
                /* fallthrough */
            case 3:
                dst[2] += src[2];
                /* fallthrough */
            case 2:
                dst[1] += src[1];
                /* fallthrough */
            case 1:
                dst[0] += src[0];
                /* fallthrough */
            case 0:
                return;
        }

        /* Eight elements per iteration, lowest index first. */
        while (count >= 8) {
            int k;

            count -= 8;
            for (k = 0; k < 8; ++k) {
                dst[k] += src[k];
            }
            src += 8;
            dst += 8;
        }
    }
}

#elif 3 == 2 && !0

/*
 * Scaled accumulate over two contiguous npy_short arrays:
 *
 *     data_out[k] = scalar * data[k] + data_out[k]      for k in [0, count)
 *
 * Scalar implementation only: a 4-way unrolled pass (skipped when
 * NPY_DISABLE_OPTIMIZATION is defined) followed by an element-wise tail.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All eight loads of a group happen before its first store. */
        const npy_short in0 = (data[0]);
        const npy_short old0 = (data_out[0]);
        const npy_short in1 = (data[1]);
        const npy_short old1 = (data_out[1]);
        const npy_short in2 = (data[2]);
        const npy_short old2 = (data_out[2]);
        const npy_short in3 = (data[3]);
        const npy_short old3 = (data_out[3]);

        data_out[0] = (npy_short)(scalar * in0 + old0);
        data_out[1] = (npy_short)(scalar * in1 + old1);
        data_out[2] = (npy_short)(scalar * in2 + old2);
        data_out[3] = (npy_short)(scalar * in3 + old3);

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left (fewer than 4 elements when the unrolled pass ran). */
    while (count > 0) {
        const npy_short in = (*data);
        const npy_short old = (*data_out);

        *data_out = (scalar * in + old);

        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand contiguous multiply-accumulate:
 *
 *     out[k] = op0[k] * op1[k] + out[k]      for k in [0, count)
 *
 * with op0 = dataptr[0], op1 = dataptr[1] and out = dataptr[2], all viewed
 * as npy_short arrays.  `nop` and the strides argument are not read.
 *
 * Scalar implementation only: a 4-way unrolled pass (skipped when
 * NPY_DISABLE_OPTIMIZATION is defined) followed by an element-wise tail.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short *out = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All loads of a group happen before its first store. */
        const npy_short l0 = (lhs[0]);
        const npy_short r0 = (rhs[0]);
        const npy_short o0 = (out[0]);
        const npy_short l1 = (lhs[1]);
        const npy_short r1 = (rhs[1]);
        const npy_short o1 = (out[1]);
        const npy_short l2 = (lhs[2]);
        const npy_short r2 = (rhs[2]);
        const npy_short o2 = (out[2]);
        const npy_short l3 = (lhs[3]);
        const npy_short r3 = (rhs[3]);
        const npy_short o3 = (out[3]);

        out[0] = (npy_short)(l0 * r0 + o0);
        out[1] = (npy_short)(l1 * r1 + o1);
        out[2] = (npy_short)(l2 * r2 + o2);
        out[3] = (npy_short)(l3 * r3 + o3);

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left (fewer than 4 elements when the unrolled pass ran). */
    while (count > 0) {
        const npy_short l = (*lhs);
        const npy_short r = (*rhs);
        const npy_short o = (*out);

        *out = (l * r + o);

        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case where operand 0 is a single value: reads one npy_short
 * from dataptr[0] and forwards it as the scalar to
 * short_sum_of_products_muladd(), together with dataptr[1] as the input
 * array and dataptr[2] as the output array.  `nop` and the strides argument
 * are not read.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = (*(npy_short *)dataptr[0]);
    npy_short *src = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand case where operand 1 is a single value: reads one npy_short
 * from dataptr[1] and forwards it as the scalar to
 * short_sum_of_products_muladd(), together with dataptr[0] as the input
 * array and dataptr[2] as the output array.  `nop` and the strides argument
 * are not read.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short factor = (*(npy_short *)dataptr[1]);
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand dot-product style reduction: multiplies `count` pairs of
 * npy_short values taken from the arrays at dataptr[0] and dataptr[1],
 * accumulates the products in an npy_short, and adds the accumulator to the
 * single npy_short stored at dataptr[2].  `nop` and the strides argument
 * are not read.
 *
 * Scalar implementation only: a 4-way unrolled pass (skipped when
 * NPY_DISABLE_OPTIMIZATION is defined) followed by an element-wise tail.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *lhs = (npy_short *)dataptr[0];
    npy_short *rhs = (npy_short *)dataptr[1];
    npy_short total = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Each product is narrowed to npy_short before the four are summed. */
        const npy_short p0 = (lhs[0]) * (rhs[0]);
        const npy_short p1 = (lhs[1]) * (rhs[1]);
        const npy_short p2 = (lhs[2]) * (rhs[2]);
        const npy_short p3 = (lhs[3]) * (rhs[3]);

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left (fewer than 4 pairs when the unrolled pass ran). */
    while (count > 0) {
        const npy_short l = (*lhs);
        const npy_short r = (*rhs);

        total += l * r;

        --count;
        ++lhs;
        ++rhs;
    }

    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + total);
}

/*
 * Two-operand reduction where operand 0 is a single value: reads one
 * npy_short from dataptr[0], obtains short_sum_of_arr() over dataptr[1] and
 * `count`, and adds the product of the two to the npy_short stored at
 * dataptr[2].  `nop` and the strides argument are not read.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[1];
    const npy_short factor = (*(npy_short *)dataptr[0]);
    const npy_short total = short_sum_of_arr(src, count);
    npy_short *out = (npy_short *)dataptr[2];

    *out = ((*out) + factor * total);
}

/*
 * Two-operand reduction where operand 1 is a single value: reads one
 * npy_short from dataptr[1], obtains short_sum_of_arr() over dataptr[0] and
 * `count`, and adds the product of the two to the npy_short stored at
 * dataptr[2].  `nop` and the strides argument are not read.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *src = (npy_short *)dataptr[0];
    const npy_short factor = (*(npy_short *)dataptr[1]);
    const npy_short total = short_sum_of_arr(src, count);
    npy_short *out = (npy_short *)dataptr[2];

    *out = ((*out) + factor * total);
}

#elif 3 == 3 && !0

static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data2 = (npy_short *)dataptr[2];
    npy_short *data_out = (npy_short *)dataptr[3];

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 576
        data_out[0] = ((data0[0]) *
                             (data1[0]) *
                             (data2[0]) +
                             (data_out[0]));

#line 576
        data_out[1] = ((data0[1]) *
                             (data1[1]) *
                             (data2[1]) +
                             (data_out[1]));

#line 576
        data_out[2] = ((data0[2]) *
                             (data1[2]) *
                             (data2[2]) +
                             (data_out[2]));

#line 576
        data_out[3] = ((data0[3]) *
                             (data1[3]) *
                             (data2[3]) +
                             (data_out[3]));

#line 576
        data_out[4] = ((data0[4]) *
                             (data1[4]) *
                             (data2[4]) +
                             (data_out[4]));

#line 576
        data_out[5] = ((data0[5]) *
                             (data1[5]) *
                             (data2[5]) +
                             (data_out[5]));

#line 576
        data_out[6] = ((data0[6]) *
                             (data1[6]) *
                             (data2[6]) +
                             (data_out[6]));

#line 576
        data_out[7] = ((data0[7]) *
                             (data1[7]) *
                             (data2[7]) +
                             (data_out[7]));

        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Finish off the loop */

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[0] = ((data0[0]) *
                         (data1[0]) *
                         (data2[0]) +
                         (data_out[0]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[1] = ((data0[1]) *
                         (data1[1]) *
                         (data2[1]) +
                         (data_out[1]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[2] = ((data0[2]) *
                         (data1[2]) *
                         (data2[2]) +
                         (data_out[2]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[3] = ((data0[3]) *
                         (data1[3]) *
                         (data2[3]) +
                         (data_out[3]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[4] = ((data0[4]) *
                         (data1[4]) *
                         (data2[4]) +
                         (data_out[4]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[5] = ((data0[5]) *
                         (data1[5]) *
                         (data2[5]) +
                         (data_out[5]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[6] = ((data0[6]) *
                         (data1[6]) *
                         (data2[6]) +
                         (data_out[6]));

#line 592
    if (count-- == 0) {
        return;
    }
    data_out[7] = ((data0[7]) *
                         (data1[7]) *
                         (data2[7]) +
                         (data_out[7]));

}

#else /* 3 > 3 || @complex */

/*
 * Fallback kernel for the template's "more than three operands or complex"
 * case.  For each of the `count` items it multiplies the current item of
 * every operand dataptr[0..nop-1], adds the product to the item at
 * dataptr[nop], and then advances all nop + 1 pointers in dataptr by
 * sizeof(npy_short).  Note that the caller's dataptr array is modified.
 *
 * The literal `#if !0` selects the real-valued code; the `#else` branch is
 * the template's complex-number variant and is never compiled here.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op;
        npy_short product = *(npy_short *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_short *)dataptr[op];
        }
        /* `op` is left at the loop's exit value when the output is read. */
        *(npy_short *)dataptr[nop] = product + *(npy_short *)dataptr[op];

        /* Step every operand pointer and the output pointer by one item. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
#else /* complex */
#  if 3 <= 3
#    define _SUMPROD_NOP 3
#  else
#    define _SUMPROD_NOP nop
#  endif
        int op;
        npy_short *out;
        npy_short acc_re = ((npy_short *)dataptr[0])[0];
        npy_short acc_im = ((npy_short *)dataptr[0])[1];

        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *factor = (npy_short *)dataptr[op];
            const npy_short next_re = acc_re * factor[0] - acc_im * factor[1];

            acc_im = acc_re * factor[1] + acc_im * factor[0];
            acc_re = next_re;
        }

        out = (npy_short *)dataptr[_SUMPROD_NOP];
        out[0] = acc_re + out[0];
        out[1] = acc_im + out[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * One-operand kernel that updates a single output item: reduces the `count`
 * npy_short items starting at dataptr[0] with short_sum_of_arr() (defined
 * elsewhere in this file) and adds the result to the npy_short stored at
 * dataptr[1].  `nop` and `strides` are not used.
 *
 * The literal `#if !0` selects the real-valued code; the `#else` branch is
 * the template's complex-number variant (interleaved re/im pairs) and is
 * never compiled here.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_short *out = (npy_short *)dataptr[1];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);

    *out = total + *out;
#else
    npy_short sum_re = 0, sum_im = 0;
    npy_short *src = (npy_short *)dataptr[0];
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per iteration while more than four remain. */
    while (count > 4) {
        const npy_short re_lo = src[0] + src[2];
        const npy_short re_hi = src[4] + src[6];
        const npy_short im_lo = src[1] + src[3];
        const npy_short im_hi = src[5] + src[7];

        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
        count -= 4;
        src += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining pairs, one at a time. */
    while (count > 0) {
        sum_re += src[0];
        sum_im += src[1];
        --count;
        src += 2;
    }
    ((npy_short *)dataptr[1])[0] += sum_re;
    ((npy_short *)dataptr[1])[1] += sum_im;
#endif // !0
}

#endif /* 3 == 1 */

/*
 * Three-operand strided kernel that updates a single output item.
 *
 * For each of the `count` steps it multiplies the npy_short values currently
 * addressed by dataptr[0], dataptr[1] and dataptr[2], adds the product to a
 * running npy_short accumulator, and advances each input by its byte stride
 * strides[0..2].  After the loop the accumulator is added to the npy_short
 * stored at dataptr[3].  The dataptr array itself is not modified and `nop`
 * is not used.
 *
 * Only the three-operand, real-valued path is kept: the template's other
 * preprocessor branches were guarded by constant-false conditions for this
 * instantiation and were never compiled.
 *
 * The product is accumulated in unsigned int and then narrowed.  With plain
 * npy_short operands the expression would be evaluated in (signed) int after
 * integer promotion, and a triple product can exceed INT_MAX, which is
 * undefined behaviour.  Unsigned arithmetic wraps and yields the same
 * low-order bits, so results are unchanged wherever the signed form was
 * well defined.
 */
static void
short_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_short accum = 0;

    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    while (count--) {
        const unsigned int a = (unsigned int)*(npy_short *)data0;
        const unsigned int b = (unsigned int)*(npy_short *)data1;
        const unsigned int c = (unsigned int)*(npy_short *)data2;

        /* Narrow every step, as the original npy_short accumulator did. */
        accum = (npy_short)((unsigned int)accum + a * b * c);

        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
    }

    /* Two npy_short values always fit in int, so this sum cannot overflow. */
    *((npy_short *)dataptr[3]) = (accum +
                                    (*((npy_short *)dataptr[3])));
}


#line 131
/*
 * Generic strided kernel for an arbitrary operand count.
 *
 * The template was instantiated with the operand-count placeholder set to
 * 1000, so every `1000 == 1/2/3` test below is false and only the final
 * `#else` of the real-valued (`#if !0`) section is compiled.  That code, for
 * each of the `count` steps, multiplies the npy_short values addressed by
 * dataptr[0..nop-1], adds the product to the npy_short at dataptr[nop], and
 * then advances all nop + 1 pointers in dataptr by their byte strides.
 * Note that the caller's dataptr array is modified.
 */
static void
short_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (1000 == 1) || (1000 <= 3 && !0)
    char *in0 = dataptr[0];
    npy_intp step0 = strides[0];
#endif
#if (1000 == 2 || 1000 == 3) && !0
    char *in1 = dataptr[1];
    npy_intp step1 = strides[1];
#endif
#if (1000 == 3) && !0
    char *in2 = dataptr[2];
    npy_intp step2 = strides[2];
#endif
#if (1000 == 1) || (1000 <= 3 && !0)
    char *out = dataptr[1000];
    npy_intp out_step = strides[1000];
#endif

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1000 == 1
        *(npy_short *)out = *(npy_short *)in0 + *(npy_short *)out;
        in0 += step0;
        out += out_step;
#  elif 1000 == 2
        *(npy_short *)out = *(npy_short *)in0 * *(npy_short *)in1 +
                            *(npy_short *)out;
        in0 += step0;
        in1 += step1;
        out += out_step;
#  elif 1000 == 3
        *(npy_short *)out = *(npy_short *)in0 * *(npy_short *)in1 *
                            *(npy_short *)in2 +
                            *(npy_short *)out;
        in0 += step0;
        in1 += step1;
        in2 += step2;
        out += out_step;
#  else
        int op;
        npy_short product = *(npy_short *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_short *)dataptr[op];
        }
        /* `op` is left at the loop's exit value when the output is read. */
        *(npy_short *)dataptr[nop] = product + *(npy_short *)dataptr[op];

        /* Step every operand pointer and the output pointer. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
#  endif
#else /* complex */
#  if 1000 == 1
        {
            npy_short *dst = (npy_short *)out;
            const npy_short *src = (npy_short *)in0;

            dst[0] = src[0] + dst[0];
            dst[1] = src[1] + dst[1];
        }
        in0 += step0;
        out += out_step;
#  else
#    if 1000 <= 3
#define _SUMPROD_NOP 1000
#    else
#define _SUMPROD_NOP nop
#    endif
        int op;
        npy_short *dst;
        npy_short acc_re = ((npy_short *)dataptr[0])[0];
        npy_short acc_im = ((npy_short *)dataptr[0])[1];

        for (op = 1; op < _SUMPROD_NOP; ++op) {
            const npy_short *factor = (npy_short *)dataptr[op];
            const npy_short next_re = acc_re * factor[0] - acc_im * factor[1];

            acc_im = acc_re * factor[1] + acc_im * factor[0];
            acc_re = next_re;
        }

        dst = (npy_short *)dataptr[_SUMPROD_NOP];
        dst[0] = acc_re + dst[0];
        dst[1] = acc_im + dst[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += strides[op];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 1000 == 1

/*
 * One-operand kernel over directly indexed items: for each of the `count`
 * items, data_out[i] = data0[i] + data_out[i], where data0 and data_out are
 * dataptr[0] and dataptr[1] viewed as npy_short arrays.  `nop` and `strides`
 * are not used.
 *
 * Control flow:
 *   - The switch handles counts 0..7.  Its cases have no `break`, so entering
 *     at `case k` falls through and processes items k-1 down to 0, then
 *     returns at `case 0`.
 *   - The switch has no `default`, so any other count skips it and reaches
 *     the loop below, which consumes eight items per iteration.
 *   - The trailing `goto` jumps back to the switch to handle the remainder
 *     (and to return).
 *
 * Every `#if !0` selects the real-valued statement; the matching `#else`
 * branch is the template's complex-number variant and is never compiled here.
 */
static void
short_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data_out = (npy_short *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_short *)data_out + 2*6)[0] =
                                    ((npy_short *)data0 + 2*6)[0] +
                                    ((npy_short *)data_out + 2*6)[0];
            ((npy_short *)data_out + 2*6)[1] =
                                    ((npy_short *)data0 + 2*6)[1] +
                                    ((npy_short *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_short *)data_out + 2*5)[0] =
                                    ((npy_short *)data0 + 2*5)[0] +
                                    ((npy_short *)data_out + 2*5)[0];
            ((npy_short *)data_out + 2*5)[1] =
                                    ((npy_short *)data0 + 2*5)[1] +
                                    ((npy_short *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_short *)data_out + 2*4)[0] =
                                    ((npy_short *)data0 + 2*4)[0] +
                                    ((npy_short *)data_out + 2*4)[0];
            ((npy_short *)data_out + 2*4)[1] =
                                    ((npy_short *)data0 + 2*4)[1] +
                                    ((npy_short *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_short *)data_out + 2*3)[0] =
                                    ((npy_short *)data0 + 2*3)[0] +
                                    ((npy_short *)data_out + 2*3)[0];
            ((npy_short *)data_out + 2*3)[1] =
                                    ((npy_short *)data0 + 2*3)[1] +
                                    ((npy_short *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_short *)data_out + 2*2)[0] =
                                    ((npy_short *)data0 + 2*2)[0] +
                                    ((npy_short *)data_out + 2*2)[0];
            ((npy_short *)data_out + 2*2)[1] =
                                    ((npy_short *)data0 + 2*2)[1] +
                                    ((npy_short *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_short *)data_out + 2*1)[0] =
                                    ((npy_short *)data0 + 2*1)[0] +
                                    ((npy_short *)data_out + 2*1)[0];
            ((npy_short *)data_out + 2*1)[1] =
                                    ((npy_short *)data0 + 2*1)[1] +
                                    ((npy_short *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_short *)data_out + 2*0)[0] =
                                    ((npy_short *)data0 + 2*0)[0] +
                                    ((npy_short *)data_out + 2*0)[0];
            ((npy_short *)data_out + 2*0)[1] =
                                    ((npy_short *)data0 + 2*0)[1] +
                                    ((npy_short *)data_out + 2*0)[1];
#endif

        /* Nothing left to do: this is the function's only exit. */
        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_short *)data_out + 2*0)[0] =
                                ((npy_short *)data0 + 2*0)[0] +
                                ((npy_short *)data_out + 2*0)[0];
        ((npy_short *)data_out + 2*0)[1] =
                                ((npy_short *)data0 + 2*0)[1] +
                                ((npy_short *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_short *)data_out + 2*1)[0] =
                                ((npy_short *)data0 + 2*1)[0] +
                                ((npy_short *)data_out + 2*1)[0];
        ((npy_short *)data_out + 2*1)[1] =
                                ((npy_short *)data0 + 2*1)[1] +
                                ((npy_short *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_short *)data_out + 2*2)[0] =
                                ((npy_short *)data0 + 2*2)[0] +
                                ((npy_short *)data_out + 2*2)[0];
        ((npy_short *)data_out + 2*2)[1] =
                                ((npy_short *)data0 + 2*2)[1] +
                                ((npy_short *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_short *)data_out + 2*3)[0] =
                                ((npy_short *)data0 + 2*3)[0] +
                                ((npy_short *)data_out + 2*3)[0];
        ((npy_short *)data_out + 2*3)[1] =
                                ((npy_short *)data0 + 2*3)[1] +
                                ((npy_short *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_short *)data_out + 2*4)[0] =
                                ((npy_short *)data0 + 2*4)[0] +
                                ((npy_short *)data_out + 2*4)[0];
        ((npy_short *)data_out + 2*4)[1] =
                                ((npy_short *)data0 + 2*4)[1] +
                                ((npy_short *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_short *)data_out + 2*5)[0] =
                                ((npy_short *)data0 + 2*5)[0] +
                                ((npy_short *)data_out + 2*5)[0];
        ((npy_short *)data_out + 2*5)[1] =
                                ((npy_short *)data0 + 2*5)[1] +
                                ((npy_short *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_short *)data_out + 2*6)[0] =
                                ((npy_short *)data0 + 2*6)[0] +
                                ((npy_short *)data_out + 2*6)[0];
        ((npy_short *)data_out + 2*6)[1] =
                                ((npy_short *)data0 + 2*6)[1] +
                                ((npy_short *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_short *)data_out + 2*7)[0] =
                                ((npy_short *)data0 + 2*7)[0] +
                                ((npy_short *)data_out + 2*7)[0];
        ((npy_short *)data_out + 2*7)[1] =
                                ((npy_short *)data0 + 2*7)[1] +
                                ((npy_short *)data_out + 2*7)[1];
#endif

        /* Move both cursors past the eight items just processed. */
        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop: re-enter the switch with the remaining count */
    goto finish_after_unrolled_loop;
}

#elif 1000 == 2 && !0

/*
 * Multiply-accumulate helper: for each of the `count` items,
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * The NPYV (universal intrinsics) implementation is compiled out by the
 * literal `#if 0`, so only the scalar code after the matching `#else` is
 * built.  Unless NPY_DISABLE_OPTIMIZATION is defined, that code handles four
 * items per iteration; a final loop handles whatever is left one at a time.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
{
#if 0 // NPYV check for npy_short
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s16;
    const npyv_s16 v_scalar = npyv_setall_s16(scalar);
    /* Aligned variant: four vectors per iteration via npyv_loada/npyv_storea */
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s16 b0 = npyv_loada_s16(data + vstep * 0);
            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
            
#line 312
            npyv_s16 b1 = npyv_loada_s16(data + vstep * 1);
            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
            
#line 312
            npyv_s16 b2 = npyv_loada_s16(data + vstep * 2);
            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
            
#line 312
            npyv_s16 b3 = npyv_loada_s16(data + vstep * 3);
            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
            
            #line 318
            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
            
#line 318
            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
            
#line 318
            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
            
#line 318
            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s16(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s16(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s16(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s16(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Other variant: same computation via npyv_load/npyv_store */
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s16 b0 = npyv_load_s16(data + vstep * 0);
            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
            
#line 312
            npyv_s16 b1 = npyv_load_s16(data + vstep * 1);
            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
            
#line 312
            npyv_s16 b2 = npyv_load_s16(data + vstep * 2);
            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
            
#line 312
            npyv_s16 b3 = npyv_load_s16(data + vstep * 3);
            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
            
            #line 318
            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
            
#line 318
            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
            
#line 318
            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
            
#line 318
            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s16(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s16(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s16(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s16(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Tail: one vector step at a time with the count-limited load/store
     * intrinsics (presumably only the first `count` lanes are touched --
     * confirm against the NPYV documentation).
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s16 a = npyv_load_tillz_s16(data, count);
        npyv_s16 b = npyv_load_tillz_s16(data_out, count);
        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, four items per iteration: load, compute, then store. */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_short b0 = (data[0]);
        const npy_short c0 = (data_out[0]);
        
#line 340
        const npy_short b1 = (data[1]);
        const npy_short c1 = (data_out[1]);
        
#line 340
        const npy_short b2 = (data[2]);
        const npy_short c2 = (data_out[2]);
        
#line 340
        const npy_short b3 = (data[3]);
        const npy_short c3 = (data_out[3]);
        
        #line 346
        const npy_short abc0 = scalar * b0 + c0;
        
#line 346
        const npy_short abc1 = scalar * b1 + c1;
        
#line 346
        const npy_short abc2 = scalar * b2 + c2;
        
#line 346
        const npy_short abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining items (all of them if the block above is disabled). */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_short b = (*data);
        const npy_short c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_short
}

/*
 * Two-operand kernel over directly indexed items: for each of the `count`
 * items,
 *     data_out[i] = data0[i] * data1[i] + data_out[i]
 * where data0, data1 and data_out are dataptr[0], dataptr[1] and dataptr[2]
 * viewed as npy_short arrays.  `nop` and `strides` are not used.
 *
 * The NPYV (universal intrinsics) implementation is compiled out by the
 * literal `#if 0`, so only the scalar code after the matching `#else` is
 * built.  Unless NPY_DISABLE_OPTIMIZATION is defined, that code handles four
 * items per iteration; a final loop handles whatever is left one at a time.
 */
static void
short_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short *data_out = (npy_short *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_short
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s16;

    /* Aligned variant: four vectors per iteration via npyv_loada/npyv_storea */
    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
            
#line 390
            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
            
#line 390
            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
            
#line 390
            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
            
            #line 397
            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
            
#line 397
            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
            
#line 397
            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
            
#line 397
            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
            
            #line 402
            npyv_storea_s16(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s16(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s16(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s16(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Other variant: same computation via npyv_load/npyv_store */
#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
            
#line 390
            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
            
#line 390
            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
            
#line 390
            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
            
            #line 397
            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
            
#line 397
            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
            
#line 397
            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
            
#line 397
            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
            
            #line 402
            npyv_store_s16(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s16(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s16(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s16(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Tail: one vector step at a time with the count-limited load/store
     * intrinsics (presumably only the first `count` lanes are touched --
     * confirm against the NPYV documentation).
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s16 a = npyv_load_tillz_s16(data0, count);
        npyv_s16 b = npyv_load_tillz_s16(data1, count);
        npyv_s16 c = npyv_load_tillz_s16(data_out, count);
        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, four items per iteration: load, compute, then store. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_short a0 = (data0[0]);
        const npy_short b0 = (data1[0]);
        const npy_short c0 = (data_out[0]);
        
#line 420
        const npy_short a1 = (data0[1]);
        const npy_short b1 = (data1[1]);
        const npy_short c1 = (data_out[1]);
        
#line 420
        const npy_short a2 = (data0[2]);
        const npy_short b2 = (data1[2]);
        const npy_short c2 = (data_out[2]);
        
#line 420
        const npy_short a3 = (data0[3]);
        const npy_short b3 = (data1[3]);
        const npy_short c3 = (data_out[3]);
        
        #line 427
        const npy_short abc0 = a0 * b0 + c0;
        
#line 427
        const npy_short abc1 = a1 * b1 + c1;
        
#line 427
        const npy_short abc2 = a2 * b2 + c2;
        
#line 427
        const npy_short abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining items (all of them if the block above is disabled). */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_short a = (*data0);
        const npy_short b = (*data1);
        const npy_short c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_short

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel with a single-value first operand: reads one npy_short
 * from dataptr[0] and hands it, together with dataptr[1] (input items),
 * dataptr[2] (output items) and `count`, to short_sum_of_products_muladd().
 * `nop` and `strides` are not used.
 */
static void
short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short scalar = *(npy_short *)dataptr[0];
    npy_short *src = (npy_short *)dataptr[1];
    npy_short *dst = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(src, dst, scalar, count);
}

/*
 * Two-operand kernel with a single-value second operand: reads one npy_short
 * from dataptr[1] and hands it, together with dataptr[0] (input items),
 * dataptr[2] (output items) and `count`, to short_sum_of_products_muladd().
 * `nop` and `strides` are not used.
 */
static void
short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_short scalar = *(npy_short *)dataptr[1];
    npy_short *src = (npy_short *)dataptr[0];
    npy_short *dst = (npy_short *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    short_sum_of_products_muladd(src, dst, scalar, count);
}

/*
 * Two-operand npy_short kernel with a single output element: sums
 * data0[k] * data1[k] over `count` elements and adds that total to the
 * npy_short stored at dataptr[2].
 *
 * dataptr[0] and dataptr[1] are read as npy_short arrays; `strides` is
 * ignored.  The NPYV branch is disabled for this type (`#if 0`), so the
 * scalar `#else` branch is the code that is actually compiled.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *data0 = (npy_short *)dataptr[0];
    npy_short *data1 = (npy_short *)dataptr[1];
    npy_short accum = 0;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_short
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s16;
    npyv_s16 vaccum = npyv_zero_s16();

    /* Both inputs passed the alignment check: four vectors per iteration via npyv_loada_s16. */
    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
            
#line 501
            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
            
#line 501
            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
            
#line 501
            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
            
            /* Each muladd takes the previous result as its addend, ending in vaccum. */
            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, vaccum);
            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
                    vaccum = npyv_muladd_s16(a0, b0, ab1);
        }
    }
    
    /* Otherwise: the same loop using npyv_load_s16 instead of npyv_loada_s16. */
#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
            
#line 501
            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
            
#line 501
            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
            
#line 501
            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
            
            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, vaccum);
            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
                    vaccum = npyv_muladd_s16(a0, b0, ab1);
        }
    }
    
    /*
     * Remainder, one vector at a time.
     * NOTE(review): npyv_load_tillz_s16 presumably loads at most `count`
     * lanes and zero-fills the rest -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s16 a = npyv_load_tillz_s16(data0, count);
        npyv_s16 b = npyv_load_tillz_s16(data1, count);
        vaccum = npyv_muladd_s16(a, b, vaccum);
    }
    accum = npyv_sum_s16(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four products per iteration; each product is stored in an npy_short
     * temporary before being added to accum.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_short ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_short ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_short ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_short ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is defined). */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_short a = (*data0);
        const npy_short b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_short
    /* The output element is read and written exactly once, after the reduction. */
    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + accum);
}

/*
 * Two-operand npy_short kernel with a scalar operand 0 and a single output
 * element: out += value0 * sum(operand 1).
 *
 * The sum over the `count` elements at dataptr[1] comes from
 * short_sum_of_arr(); `strides` is ignored.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *const out = (npy_short *)dataptr[2];
    const npy_short scalar = *(npy_short *)dataptr[0];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[1], count);

    *out = (*out + scalar * total);
}

/*
 * Two-operand npy_short kernel with a scalar operand 1 and a single output
 * element: out += value1 * sum(operand 0).
 *
 * The sum over the `count` elements at dataptr[0] comes from
 * short_sum_of_arr(); `strides` is ignored.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *const out = (npy_short *)dataptr[2];
    const npy_short scalar = *(npy_short *)dataptr[1];
    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);

    *out = (*out + scalar * total);
}

#elif 1000 == 3 && !0

/*
 * Three-operand npy_short kernel over plain arrays:
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output; `strides` is
 * ignored.  Elements are processed in groups of eight, lowest index first,
 * followed by a tail that stops once `count` is exhausted.
 */
static void
short_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_short *in0 = (npy_short *)dataptr[0];
    npy_short *in1 = (npy_short *)dataptr[1];
    npy_short *in2 = (npy_short *)dataptr[2];
    npy_short *out = (npy_short *)dataptr[3];
    int k;

    /* Whole groups of eight elements. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = (in0[k] * in1[k] * in2[k] + out[k]);
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /* Tail: check the remaining count before every element. */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = (in0[k] * in1[k] * in2[k] + out[k]);
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Generic npy_short kernel for a run-time number of operands.
 *
 * For each of `count` elements, multiplies the current values of operands
 * 0 .. nop-1 together, adds the current output value and stores the result
 * through dataptr[nop].  Every pointer in dataptr[0 .. nop] is then advanced
 * by sizeof(npy_short); `strides` is ignored.  dataptr[] is modified in
 * place.
 */
static void
short_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_short product = *(npy_short *)dataptr[0];
        int op = 1;

        while (op < nop) {
            product *= *(npy_short *)dataptr[op];
            ++op;
        }
        /* `op` is left at the index the product loop stopped on. */
        *(npy_short *)dataptr[nop] = (product + *(npy_short *)dataptr[op]);

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_short);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand npy_short kernel with a single output element: adds the sum
 * of the `count` elements at dataptr[0] (computed by short_sum_of_arr()) to
 * the npy_short stored at dataptr[1].  `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_short *out;
    npy_short total;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    total = short_sum_of_arr((npy_short *)dataptr[0], count);
    out = (npy_short *)dataptr[1];
    *out = (total + *out);
}

#endif /* 1000 == 1 */

/*
 * Generic npy_short reduction kernel for a run-time number of operands.
 *
 * For each of `count` steps, multiplies the current values of operands
 * 0 .. nop-1 together and adds the product to a running npy_short total,
 * then advances dataptr[i] by strides[i] for i < nop (the output pointer is
 * not advanced).  After the loop the total is added to the npy_short at
 * dataptr[nop].  dataptr[] is modified in place.
 */
static void
short_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_short total = 0;
    npy_short *out;
    int op;

    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_short product = *(npy_short *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_short *)dataptr[op];
        }
        total += product;

        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
    }

    out = (npy_short *)dataptr[nop];
    *out = (total + *out);
}




#line 74

#if !0
/*
 * Returns the sum of the `count` npy_int values starting at `data`.
 *
 * The NPYV branch is disabled for this type (`#if 0`), so the scalar `#else`
 * branch is the code that is actually compiled.  The total is accumulated in
 * an npy_int.
 */
static NPY_GCC_OPT_3 npy_int int_sum_of_arr(npy_int *data, npy_intp count)
{
    npy_int accum = 0;
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data);
    const int vstep = npyv_nlanes_s32;
    npyv_s32 vaccum = npyv_zero_s32();
    const npy_intp vstepx4 = vstep * 4;

    /* Alignment check passed: four vectors per iteration via npyv_loada_s32. */
    #line 91
    if(is_aligned) {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s32 a0 = npyv_loada_s32(data + vstep * 0);
            
#line 96
            npyv_s32 a1 = npyv_loada_s32(data + vstep * 1);
            
#line 96
            npyv_s32 a2 = npyv_loada_s32(data + vstep * 2);
            
#line 96
            npyv_s32 a3 = npyv_loada_s32(data + vstep * 3);
            
            /* Pairwise adds first, then fold the group into the accumulator. */
            npyv_s32 a01   = npyv_add_s32(a0, a1);
            npyv_s32 a23   = npyv_add_s32(a2, a3);
            npyv_s32 a0123 = npyv_add_s32(a01, a23);
                      vaccum = npyv_add_s32(a0123, vaccum);
        }
    }
    
    /* Otherwise: the same loop using npyv_load_s32 instead of npyv_loada_s32. */
#line 91
    else {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s32 a0 = npyv_load_s32(data + vstep * 0);
            
#line 96
            npyv_s32 a1 = npyv_load_s32(data + vstep * 1);
            
#line 96
            npyv_s32 a2 = npyv_load_s32(data + vstep * 2);
            
#line 96
            npyv_s32 a3 = npyv_load_s32(data + vstep * 3);
            
            npyv_s32 a01   = npyv_add_s32(a0, a1);
            npyv_s32 a23   = npyv_add_s32(a2, a3);
            npyv_s32 a0123 = npyv_add_s32(a01, a23);
                      vaccum = npyv_add_s32(a0123, vaccum);
        }
    }
    
    /*
     * Remainder, one vector at a time.
     * NOTE(review): npyv_load_tillz_s32 presumably loads at most `count`
     * lanes and zero-fills the rest -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data, count);
        vaccum = npyv_add_s32(a, vaccum);
    }
    accum = npyv_sum_s32(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration, added pairwise.
     * NOTE(review): the guard is `count > 4`, so when exactly four elements
     * remain they are handled by the one-at-a-time loop below; other
     * unrolled loops in this file use `count >= 4`.
     */
    for (; count > 4; count -= 4, data += 4) {
        const npy_int a01 = (*data) + (data[1]);
        const npy_int a23 = (data[2]) + (data[3]);
        accum +=  a01 + a23;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is defined). */
    for (; count > 0; --count, data++) {
        accum += (*data);
    }
#endif // NPYV check for npy_int
    return accum;
}
#endif

#line 131
/*
 * One-operand npy_int kernel with arbitrary strides: for each of `count`
 * steps, adds the npy_int at the input pointer to the npy_int at the output
 * pointer, then advances the input by strides[0] bytes and the output by
 * strides[1] bytes.  The dataptr[] array itself is left untouched.
 */
static void
int_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    const npy_intp in_step = strides[0];
    const npy_intp out_step = strides[1];
    char *in = dataptr[0];
    char *out = dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count, in += in_step, out += out_step) {
        *(npy_int *)out = (*(npy_int *)in + *(npy_int *)out);
    }
}

#if 1 == 1

/*
 * One-operand npy_int kernel over plain arrays:
 *     data_out[k] = data0[k] + data_out[k]   for `count` elements.
 *
 * dataptr[0] is the input and dataptr[1] the output; `strides` is ignored.
 *
 * Control flow: the switch under the label handles counts 0..7 and returns.
 * Any count >= 8 matches no case, so execution continues into the 8-way
 * loop and then jumps back to the switch to finish the remaining 0..7
 * elements.
 *
 * Every `#if !0` selects its first half; the `#else` halves (two values per
 * element) are never compiled in this instantiation.
 *
 * NOTE(review): a negative count matches no case and fails `count >= 8`, so
 * the goto would loop forever; callers presumably always pass count >= 0.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data_out = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
        /*
         * Cases 7..1 fall through on purpose: entering at `case k` updates
         * elements k-1 down to 0, then reaches `case 0` and returns.
         */
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_int *)data_out + 2*6)[0] =
                                    ((npy_int *)data0 + 2*6)[0] +
                                    ((npy_int *)data_out + 2*6)[0];
            ((npy_int *)data_out + 2*6)[1] =
                                    ((npy_int *)data0 + 2*6)[1] +
                                    ((npy_int *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_int *)data_out + 2*5)[0] =
                                    ((npy_int *)data0 + 2*5)[0] +
                                    ((npy_int *)data_out + 2*5)[0];
            ((npy_int *)data_out + 2*5)[1] =
                                    ((npy_int *)data0 + 2*5)[1] +
                                    ((npy_int *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_int *)data_out + 2*4)[0] =
                                    ((npy_int *)data0 + 2*4)[0] +
                                    ((npy_int *)data_out + 2*4)[0];
            ((npy_int *)data_out + 2*4)[1] =
                                    ((npy_int *)data0 + 2*4)[1] +
                                    ((npy_int *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_int *)data_out + 2*3)[0] =
                                    ((npy_int *)data0 + 2*3)[0] +
                                    ((npy_int *)data_out + 2*3)[0];
            ((npy_int *)data_out + 2*3)[1] =
                                    ((npy_int *)data0 + 2*3)[1] +
                                    ((npy_int *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_int *)data_out + 2*2)[0] =
                                    ((npy_int *)data0 + 2*2)[0] +
                                    ((npy_int *)data_out + 2*2)[0];
            ((npy_int *)data_out + 2*2)[1] =
                                    ((npy_int *)data0 + 2*2)[1] +
                                    ((npy_int *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_int *)data_out + 2*1)[0] =
                                    ((npy_int *)data0 + 2*1)[0] +
                                    ((npy_int *)data_out + 2*1)[0];
            ((npy_int *)data_out + 2*1)[1] =
                                    ((npy_int *)data0 + 2*1)[1] +
                                    ((npy_int *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_int *)data_out + 2*0)[0] =
                                    ((npy_int *)data0 + 2*0)[0] +
                                    ((npy_int *)data_out + 2*0)[0];
            ((npy_int *)data_out + 2*0)[1] =
                                    ((npy_int *)data0 + 2*0)[1] +
                                    ((npy_int *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_int *)data_out + 2*0)[0] =
                                ((npy_int *)data0 + 2*0)[0] +
                                ((npy_int *)data_out + 2*0)[0];
        ((npy_int *)data_out + 2*0)[1] =
                                ((npy_int *)data0 + 2*0)[1] +
                                ((npy_int *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_int *)data_out + 2*1)[0] =
                                ((npy_int *)data0 + 2*1)[0] +
                                ((npy_int *)data_out + 2*1)[0];
        ((npy_int *)data_out + 2*1)[1] =
                                ((npy_int *)data0 + 2*1)[1] +
                                ((npy_int *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_int *)data_out + 2*2)[0] =
                                ((npy_int *)data0 + 2*2)[0] +
                                ((npy_int *)data_out + 2*2)[0];
        ((npy_int *)data_out + 2*2)[1] =
                                ((npy_int *)data0 + 2*2)[1] +
                                ((npy_int *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_int *)data_out + 2*3)[0] =
                                ((npy_int *)data0 + 2*3)[0] +
                                ((npy_int *)data_out + 2*3)[0];
        ((npy_int *)data_out + 2*3)[1] =
                                ((npy_int *)data0 + 2*3)[1] +
                                ((npy_int *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_int *)data_out + 2*4)[0] =
                                ((npy_int *)data0 + 2*4)[0] +
                                ((npy_int *)data_out + 2*4)[0];
        ((npy_int *)data_out + 2*4)[1] =
                                ((npy_int *)data0 + 2*4)[1] +
                                ((npy_int *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_int *)data_out + 2*5)[0] =
                                ((npy_int *)data0 + 2*5)[0] +
                                ((npy_int *)data_out + 2*5)[0];
        ((npy_int *)data_out + 2*5)[1] =
                                ((npy_int *)data0 + 2*5)[1] +
                                ((npy_int *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_int *)data_out + 2*6)[0] =
                                ((npy_int *)data0 + 2*6)[0] +
                                ((npy_int *)data_out + 2*6)[0];
        ((npy_int *)data_out + 2*6)[1] =
                                ((npy_int *)data0 + 2*6)[1] +
                                ((npy_int *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_int *)data_out + 2*7)[0] =
                                ((npy_int *)data0 + 2*7)[0] +
                                ((npy_int *)data_out + 2*7)[0];
        ((npy_int *)data_out + 2*7)[1] =
                                ((npy_int *)data0 + 2*7)[1] +
                                ((npy_int *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop: re-enter the switch with the remaining 0..7 elements */
    goto finish_after_unrolled_loop;
}

#elif 1 == 2 && !0

/*
 * Multiply-add over npy_int arrays:
 *     data_out[k] = scalar * data[k] + data_out[k]   for `count` elements.
 *
 * The NPYV branch is disabled for this type (`#if 0`), so the scalar `#else`
 * branch is the code that is actually compiled.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
{
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s32;
    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
    /* Both pointers passed the alignment check: npyv_loada_s32 / npyv_storea_s32, four vectors per iteration. */
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Otherwise: the same loop using npyv_load_s32 / npyv_store_s32. */
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder, one vector at a time.
     * NOTE(review): npyv_load_tillz_s32 / npyv_store_till_s32 presumably
     * touch at most `count` lanes -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data, count);
        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration.  All four input/output pairs are loaded
     * before any result is stored.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_int b0 = (data[0]);
        const npy_int c0 = (data_out[0]);
        
#line 340
        const npy_int b1 = (data[1]);
        const npy_int c1 = (data_out[1]);
        
#line 340
        const npy_int b2 = (data[2]);
        const npy_int c2 = (data_out[2]);
        
#line 340
        const npy_int b3 = (data[3]);
        const npy_int c3 = (data_out[3]);
        
        #line 346
        const npy_int abc0 = scalar * b0 + c0;
        
#line 346
        const npy_int abc1 = scalar * b1 + c1;
        
#line 346
        const npy_int abc2 = scalar * b2 + c2;
        
#line 346
        const npy_int abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is defined). */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_int b = (*data);
        const npy_int c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_int
}

/*
 * Two contiguous operands accumulated into a contiguous output:
 *
 *     out[i] = op0[i] * op1[i] + out[i]
 *
 * dataptr[0] and dataptr[1] are the operands, dataptr[2] is the output and
 * count is the number of elements. Scalar implementation: unless
 * NPY_DISABLE_OPTIMIZATION is defined the bulk of the work is done in
 * groups of four, and within a group every result is computed before any
 * of them is written back.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        npy_int fused[4];
        int k;

        /* Read and combine the whole group first ... */
        for (k = 0; k < 4; ++k) {
            fused[k] = lhs[k] * rhs[k] + out[k];
        }
        /* ... and only then store it. */
        for (k = 0; k < 4; ++k) {
            out[k] = fused[k];
        }

        lhs += 4;
        rhs += 4;
        out += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Whatever is left is handled one element at a time. */
    while (count > 0) {
        *out = (*lhs) * (*rhs) + (*out);
        ++lhs;
        ++rhs;
        ++out;
        --count;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case where operand 0 is a single value (read once from
 * dataptr[0]) while operand 1 and the output are walked as element arrays.
 * The work is delegated to int_sum_of_products_muladd, which is documented
 * at its definition as computing dataout = data*scalar+dataout.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int factor = *(npy_int *)dataptr[0];
    npy_int *const vec = (npy_int *)dataptr[1];
    npy_int *const out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    int_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand case where operand 1 is a single value (read once from
 * dataptr[1]) while operand 0 and the output are walked as element arrays.
 * The work is delegated to int_sum_of_products_muladd, which is documented
 * at its definition as computing dataout = data*scalar+dataout.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int factor = *(npy_int *)dataptr[1];
    npy_int *const vec = (npy_int *)dataptr[0];
    npy_int *const out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    int_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two contiguous operands reduced into a single output element:
 *
 *     *out += op0[0] * op1[0] + op0[1] * op1[1] + ...
 *
 * dataptr[0] and dataptr[1] are the operands, dataptr[2] points at the one
 * output value and count is the number of element pairs. Scalar
 * implementation: unless NPY_DISABLE_OPTIMIZATION is defined the products
 * are summed four at a time before being added to the running total. The
 * output is only touched once, after the loops.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *rhs = (npy_int *)dataptr[1];
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Sum of the four products, built up left to right. */
        npy_int group = lhs[0] * rhs[0];
        group += lhs[1] * rhs[1];
        group += lhs[2] * rhs[2];
        group += lhs[3] * rhs[3];
        total += group;

        lhs += 4;
        rhs += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining pairs, one at a time. */
    while (count > 0) {
        total += (*lhs) * (*rhs);
        ++lhs;
        ++rhs;
        --count;
    }

    *(npy_int *)dataptr[2] = (*(npy_int *)dataptr[2]) + total;
}

/*
 * Two-operand reduction where operand 0 is a single value: the value read
 * from dataptr[0] is multiplied by the result of int_sum_of_arr over
 * operand 1 (defined elsewhere in this file; presumably the sum of its
 * count elements) and the product is added to the output element at
 * dataptr[2].
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *const out = (npy_int *)dataptr[2];
    const npy_int factor = *(npy_int *)dataptr[0];
    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);

    *out = (*out) + factor * total;
}

/*
 * Two-operand reduction where operand 1 is a single value: the value read
 * from dataptr[1] is multiplied by the result of int_sum_of_arr over
 * operand 0 (defined elsewhere in this file; presumably the sum of its
 * count elements) and the product is added to the output element at
 * dataptr[2].
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *const out = (npy_int *)dataptr[2];
    npy_int *const vec = (npy_int *)dataptr[0];
    const npy_int factor = *(npy_int *)dataptr[1];
    const npy_int total = int_sum_of_arr(vec, count);

    *out = (*out) + factor * total;
}

#elif 1 == 3 && !0

/*
 * Three contiguous operands accumulated into a contiguous output:
 *
 *     out[i] = op0[i] * op1[i] * op2[i] + out[i]
 *
 * dataptr[0..2] are the operands, dataptr[3] is the output and count is
 * the number of elements. Elements are processed in ascending order, each
 * one read and written before the next is touched.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *a = (npy_int *)dataptr[0];
    npy_int *b = (npy_int *)dataptr[1];
    npy_int *c = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Main part: eight elements per pass. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = a[k] * b[k] * c[k] + out[k];
        }
        a += 8;
        b += 8;
        c += 8;
        out += 8;
    }

    /* Remainder: stop as soon as the count is used up. */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = a[k] * b[k] * c[k] + out[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic contiguous kernel for an arbitrary number of operands (nop).
 * For each of the count elements the current values of operands
 * 0 .. nop-1 are multiplied together, the product is added into the output
 * at dataptr[nop], and every pointer in dataptr[0..nop] is advanced by one
 * npy_int. Note that the dataptr array itself is modified.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_int product = *(npy_int *)dataptr[0];
        int op = 1;

        while (op < nop) {
            product *= *(npy_int *)dataptr[op];
            ++op;
        }

        /*
         * The previous output value is read through the index the loop
         * above stopped at; that is nop whenever nop >= 1.
         */
        *(npy_int *)dataptr[nop] = product + *(npy_int *)dataptr[op];

        /* Step all operands and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Single contiguous operand reduced into one output element: the result
 * of int_sum_of_arr over dataptr[0] (defined elsewhere in this file;
 * presumably the sum of its count elements) is added to the value at
 * dataptr[1]. The strides argument is not consulted.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_int *const out = (npy_int *)dataptr[1];
    npy_int total;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    /* Finish the reduction before the output is read. */
    total = int_sum_of_arr((npy_int *)dataptr[0], count);
    *out = total + (*out);
}

#endif /* 1 == 1 */

/*
 * Single strided operand reduced into one output element. Starting at
 * dataptr[0], count values spaced strides[0] bytes apart are summed and
 * the total is added to the value at dataptr[1].
 */
static void
int_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    const npy_intp step = strides[0];
    char *src = dataptr[0];
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += *(npy_int *)src;
        src += step;
    }

    *(npy_int *)dataptr[1] = total + *(npy_int *)dataptr[1];
}


#line 131
/*
 * General strided two-operand kernel:
 *
 *     out = op0 * op1 + out
 *
 * is applied count times. After every element the operand pointers
 * (starting at dataptr[0] and dataptr[1]) and the output pointer (starting
 * at dataptr[2]) are advanced by strides[0], strides[1] and strides[2]
 * bytes respectively.
 */
static void
int_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    char *out = dataptr[2];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    const npy_intp out_step = strides[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_int *)out = (*(npy_int *)lhs) * (*(npy_int *)rhs) +
                          (*(npy_int *)out);
        lhs += lhs_step;
        rhs += rhs_step;
        out += out_step;
    }
}

#if 2 == 1

/*
 * Single contiguous operand added into a contiguous output:
 *
 *     out[i] = op0[i] + out[i]
 *
 * dataptr[0] is the operand, dataptr[1] the output and count the number
 * of elements. Counts below eight are finished off directly (highest
 * index first); larger counts are first reduced in groups of eight.
 * NOTE(review): a negative count is not handled and never terminates.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *src = (npy_int *)dataptr[0];
    npy_int *dst = (npy_int *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Tested before the main loop so that small counts stay fast. */
        if (count >= 0 && count < 8) {
            for (k = count; k > 0; --k) {
                dst[k - 1] = src[k - 1] + dst[k - 1];
            }
            return;
        }

        /* Main part: eight elements per pass, in ascending order. */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] + dst[k];
            }
            src += 8;
            dst += 8;
            count -= 8;
        }
    }
}

#elif 2 == 2 && !0

/*
 * Multiply-accumulate over `count` contiguous elements:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * Parameters:
 *   data     - pointer to the source elements (only read through).
 *   data_out - pointer to the destination elements (read, then overwritten).
 *   scalar   - multiplier applied to every element of `data`.
 *   count    - number of elements to process.
 *
 * Nothing is returned; the result is written in place into `data_out`.
 *
 * In this instantiation the vectorized (NPYV) path is compiled out by the
 * literal `#if 0`, so only the scalar path after the matching `#else` is
 * built.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
{
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s32;
    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
    /* Main vector loop, aligned variant: four vectors per iteration. */
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Same loop body, but with the unaligned load/store variants. */
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder: one vector per step through the partial load/store
     * intrinsics, which are passed the remaining `count`.
     * NOTE(review): presumably these touch only the first `count` lanes and
     * zero-fill the rest on load -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data, count);
        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Manually unrolled by 4. All eight loads are issued before any of the
     * four stores, so the statement order matters if the two buffers overlap.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_int b0 = (data[0]);
        const npy_int c0 = (data_out[0]);
        
#line 340
        const npy_int b1 = (data[1]);
        const npy_int c1 = (data_out[1]);
        
#line 340
        const npy_int b2 = (data[2]);
        const npy_int c2 = (data_out[2]);
        
#line 340
        const npy_int b3 = (data[3]);
        const npy_int c3 = (data_out[3]);
        
        #line 346
        const npy_int abc0 = scalar * b0 + c0;
        
#line 346
        const npy_int abc1 = scalar * b1 + c1;
        
#line 346
        const npy_int abc2 = scalar * b2 + c2;
        
#line 346
        const npy_int abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Element-wise loop: handles what the unrolled loop left over, or every
     * element when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_int b = (*data);
        const npy_int c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_int
}

/*
 * Two-operand sum-of-products kernel over `count` elements, with all three
 * buffers indexed as plain npy_int arrays:
 *
 *     out[i] = in0[i] * in1[i] + out[i]
 *
 * Parameters:
 *   nop     - not referenced in this function.
 *   dataptr - dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the
 *             output; each is reinterpreted as npy_int *.
 *   strides - unused (marked NPY_UNUSED); elements are accessed by index.
 *   count   - number of elements to process.
 *
 * The entries of `dataptr` themselves are not modified; only local copies of
 * the pointers are advanced.
 *
 * In this instantiation the vectorized (NPYV) path is compiled out by the
 * literal `#if 0`, so only the scalar path after the matching `#else` is
 * built.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *data_out = (npy_int *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_int
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s32;

    /* Main vector loop, aligned variant: four vectors per iteration. */
    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
            
#line 390
            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
            
#line 390
            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
            
#line 390
            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
            
            #line 397
            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
            
#line 397
            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
            
#line 397
            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
            
#line 397
            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
            
            #line 402
            npyv_storea_s32(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s32(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s32(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Same loop body, but with the unaligned load/store variants. */
#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
            
#line 390
            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
            
#line 390
            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
            
#line 390
            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
            
            #line 397
            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
            
#line 397
            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
            
#line 397
            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
            
#line 397
            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
            
            #line 402
            npyv_store_s32(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s32(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s32(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder: one vector per step through the partial load/store
     * intrinsics, which are passed the remaining `count`.
     * NOTE(review): presumably these touch only the first `count` lanes and
     * zero-fill the rest on load -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data0, count);
        npyv_s32 b = npyv_load_tillz_s32(data1, count);
        npyv_s32 c = npyv_load_tillz_s32(data_out, count);
        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Manually unrolled by 4. All twelve loads are issued before any of the
     * four stores, so the statement order matters if the buffers overlap.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_int a0 = (data0[0]);
        const npy_int b0 = (data1[0]);
        const npy_int c0 = (data_out[0]);
        
#line 420
        const npy_int a1 = (data0[1]);
        const npy_int b1 = (data1[1]);
        const npy_int c1 = (data_out[1]);
        
#line 420
        const npy_int a2 = (data0[2]);
        const npy_int b2 = (data1[2]);
        const npy_int c2 = (data_out[2]);
        
#line 420
        const npy_int a3 = (data0[3]);
        const npy_int b3 = (data1[3]);
        const npy_int c3 = (data_out[3]);
        
        #line 427
        const npy_int abc0 = a0 * b0 + c0;
        
#line 427
        const npy_int abc1 = a1 * b1 + c1;
        
#line 427
        const npy_int abc2 = a2 * b2 + c2;
        
#line 427
        const npy_int abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Element-wise loop: handles what the unrolled loop left over, or every
     * element when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_int a = (*data0);
        const npy_int b = (*data1);
        const npy_int c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_int

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where a single value from operand 0 scales every element
 * of operand 1 and the products are accumulated into the output:
 *
 *     out[i] = in0[0] * in1[i] + out[i]
 *
 * Only the first element behind dataptr[0] is read. The element-wise work is
 * delegated to int_sum_of_products_muladd(). `nop` and `strides` are not used.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *dst = (npy_int *)dataptr[2];
    npy_int *src = (npy_int *)dataptr[1];
    /* Read the multiplier once, before the output is touched. */
    const npy_int multiplier = *(npy_int *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    int_sum_of_products_muladd(src, dst, multiplier, count);
}

/*
 * Two-operand kernel where a single value from operand 1 scales every element
 * of operand 0 and the products are accumulated into the output:
 *
 *     out[i] = in1[0] * in0[i] + out[i]
 *
 * Only the first element behind dataptr[1] is read. The element-wise work is
 * delegated to int_sum_of_products_muladd(). `nop` and `strides` are not used.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *dst = (npy_int *)dataptr[2];
    npy_int *src = (npy_int *)dataptr[0];
    /* Read the multiplier once, before the output is touched. */
    const npy_int multiplier = *(npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    int_sum_of_products_muladd(src, dst, multiplier, count);
}

/*
 * Two-operand reduction into a single output element:
 *
 *     *out = *out + sum over i in [0, count) of in0[i] * in1[i]
 *
 * Parameters:
 *   nop     - not referenced in this function.
 *   dataptr - dataptr[0] and dataptr[1] are the inputs, indexed as plain
 *             npy_int arrays; dataptr[2] points at the one output element.
 *   strides - unused (marked NPY_UNUSED).
 *   count   - number of input elements to reduce.
 *
 * The products are summed in the local `accum` and added to the existing
 * output value once, at the very end.
 *
 * In this instantiation the vectorized (NPYV) path is compiled out by the
 * literal `#if 0`, so only the scalar path after the matching `#else` is
 * built.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int accum = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s32;
    npyv_s32 vaccum = npyv_zero_s32();

    /*
     * Main vector loop, aligned variant: four vector pairs per iteration,
     * chained through muladd so each step feeds the next and the last one
     * updates `vaccum`.
     */
    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
            
#line 501
            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
            
#line 501
            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
            
#line 501
            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
            
            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, vaccum);
            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
                    vaccum = npyv_muladd_s32(a0, b0, ab1);
        }
    }
    
    /* Same loop body, but with the unaligned load variant. */
#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
            
#line 501
            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
            
#line 501
            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
            
#line 501
            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
            
            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, vaccum);
            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
                    vaccum = npyv_muladd_s32(a0, b0, ab1);
        }
    }
    
    /*
     * Remainder: one vector per step through the partial load intrinsic,
     * which is passed the remaining `count`.
     * NOTE(review): presumably lanes past `count` are zero-filled so they do
     * not contribute to the sum -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data0, count);
        npyv_s32 b = npyv_load_tillz_s32(data1, count);
        vaccum = npyv_muladd_s32(a, b, vaccum);
    }
    /* NOTE(review): npyv_sum_s32 presumably adds all lanes together -- confirm. */
    accum = npyv_sum_s32(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Manually unrolled by 4: four products are formed, then added at once. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_int ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_int ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_int ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_int ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Element-wise loop: handles what the unrolled loop left over, or every
     * element when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_int a = (*data0);
        const npy_int b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_int
    /* Add the reduction to the value already stored in the output. */
    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction where operand 0 contributes a single value:
 *
 *     *out = *out + in0[0] * int_sum_of_arr(in1, count)
 *
 * The multiplier is applied once to the summed array instead of to every
 * element. `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Read the multiplier first, then reduce operand 1. */
    const npy_int factor = *(npy_int *)dataptr[0];
    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);
    npy_int *out = (npy_int *)dataptr[2];

    *out = *out + factor * total;
}

/*
 * Two-operand reduction where operand 1 contributes a single value:
 *
 *     *out = *out + in1[0] * int_sum_of_arr(in0, count)
 *
 * The multiplier is applied once to the summed array instead of to every
 * element. `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Read the multiplier first, then reduce operand 0. */
    const npy_int factor = *(npy_int *)dataptr[1];
    const npy_int total = int_sum_of_arr((npy_int *)dataptr[0], count);
    npy_int *out = (npy_int *)dataptr[2];

    *out = *out + factor * total;
}

#elif 2 == 3 && !0

/*
 * Three-operand sum-of-products kernel, with all four buffers indexed as
 * plain npy_int arrays:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * Elements are processed strictly in index order: first in groups of eight,
 * then a tail of fewer than eight. `nop` and `strides` are not used.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *lhs = (npy_int *)dataptr[0];
    npy_int *mid = (npy_int *)dataptr[1];
    npy_int *rhs = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Main pass: eight elements per outer iteration. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
        }
        lhs += 8;
        mid += 8;
        rhs += 8;
        out += 8;
    }

    /* Tail: stop as soon as the remaining count reaches zero. */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic contiguous sum-of-products kernel for `nop` operands (real-valued
 * path, the one selected by the template's literal `#if !0`):
 *
 *     out = in[0] * in[1] * ... * in[nop-1] + out
 *
 * Unlike the specialised kernels, this one advances the entries of `dataptr`
 * themselves by sizeof(npy_int) after every element, the output included.
 * `strides` is not used.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    int op;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_int product = *(npy_int *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_int *)dataptr[op];
        }
        /* `op` is left at the loop's exit value, which is nop for nop >= 1. */
        *(npy_int *)dataptr[nop] = product + *(npy_int *)dataptr[op];

        /* Step every operand and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand reduction into a single output element (real-valued path, the
 * one selected by the template's literal `#if !0`):
 *
 *     *out = int_sum_of_arr(in0, count) + *out
 *
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_int *out = (npy_int *)dataptr[1];
    npy_int total;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    /* Reduce the input first, then fold it into the existing output value. */
    total = int_sum_of_arr((npy_int *)dataptr[0], count);
    *out = total + *out;
}

#endif /* 2 == 1 */

/*
 * Strided two-operand reduction into a single output element:
 *
 *     *out = (sum over `count` steps of in0 * in1) + *out
 *
 * The inputs are walked as byte pointers, advancing by strides[0] and
 * strides[1] bytes per step. The entries of `dataptr` are not modified.
 * This is the real-valued, two-operand form that the template's literal
 * conditionals select for this instantiation. `nop` is not used.
 */
static void
int_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    npy_int *out = (npy_int *)dataptr[2];
    npy_int total = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_int *)lhs) * (*(npy_int *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Fold the reduction into the value already stored in the output. */
    *out = total + *out;
}


#line 131
/*
 * Strided three-operand sum-of-products kernel:
 *
 *     out = in0 * in1 * in2 + out        (repeated `count` times)
 *
 * Every buffer, the output included, is walked as a byte pointer that
 * advances by its own entry in `strides` after each element. The entries of
 * `dataptr` are not modified. This is the real-valued, three-operand form
 * that the template's literal conditionals select for this instantiation.
 * `nop` is not used.
 */
static void
int_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *in_a = dataptr[0];
    char *in_b = dataptr[1];
    char *in_c = dataptr[2];
    char *out = dataptr[3];
    const npy_intp step_a = strides[0];
    const npy_intp step_b = strides[1];
    const npy_intp step_c = strides[2];
    const npy_intp step_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_three (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_int *dst = (npy_int *)out;

        *dst = (*(npy_int *)in_a) * (*(npy_int *)in_b) * (*(npy_int *)in_c) + *dst;

        in_a += step_a;
        in_b += step_b;
        in_c += step_c;
        out += step_out;
    }
}

#if 3 == 1

static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data_out = (npy_int *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_int *)data_out + 2*6)[0] =
                                    ((npy_int *)data0 + 2*6)[0] +
                                    ((npy_int *)data_out + 2*6)[0];
            ((npy_int *)data_out + 2*6)[1] =
                                    ((npy_int *)data0 + 2*6)[1] +
                                    ((npy_int *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_int *)data_out + 2*5)[0] =
                                    ((npy_int *)data0 + 2*5)[0] +
                                    ((npy_int *)data_out + 2*5)[0];
            ((npy_int *)data_out + 2*5)[1] =
                                    ((npy_int *)data0 + 2*5)[1] +
                                    ((npy_int *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_int *)data_out + 2*4)[0] =
                                    ((npy_int *)data0 + 2*4)[0] +
                                    ((npy_int *)data_out + 2*4)[0];
            ((npy_int *)data_out + 2*4)[1] =
                                    ((npy_int *)data0 + 2*4)[1] +
                                    ((npy_int *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_int *)data_out + 2*3)[0] =
                                    ((npy_int *)data0 + 2*3)[0] +
                                    ((npy_int *)data_out + 2*3)[0];
            ((npy_int *)data_out + 2*3)[1] =
                                    ((npy_int *)data0 + 2*3)[1] +
                                    ((npy_int *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_int *)data_out + 2*2)[0] =
                                    ((npy_int *)data0 + 2*2)[0] +
                                    ((npy_int *)data_out + 2*2)[0];
            ((npy_int *)data_out + 2*2)[1] =
                                    ((npy_int *)data0 + 2*2)[1] +
                                    ((npy_int *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_int *)data_out + 2*1)[0] =
                                    ((npy_int *)data0 + 2*1)[0] +
                                    ((npy_int *)data_out + 2*1)[0];
            ((npy_int *)data_out + 2*1)[1] =
                                    ((npy_int *)data0 + 2*1)[1] +
                                    ((npy_int *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_int *)data_out + 2*0)[0] =
                                    ((npy_int *)data0 + 2*0)[0] +
                                    ((npy_int *)data_out + 2*0)[0];
            ((npy_int *)data_out + 2*0)[1] =
                                    ((npy_int *)data0 + 2*0)[1] +
                                    ((npy_int *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_int *)data_out + 2*0)[0] =
                                ((npy_int *)data0 + 2*0)[0] +
                                ((npy_int *)data_out + 2*0)[0];
        ((npy_int *)data_out + 2*0)[1] =
                                ((npy_int *)data0 + 2*0)[1] +
                                ((npy_int *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_int *)data_out + 2*1)[0] =
                                ((npy_int *)data0 + 2*1)[0] +
                                ((npy_int *)data_out + 2*1)[0];
        ((npy_int *)data_out + 2*1)[1] =
                                ((npy_int *)data0 + 2*1)[1] +
                                ((npy_int *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_int *)data_out + 2*2)[0] =
                                ((npy_int *)data0 + 2*2)[0] +
                                ((npy_int *)data_out + 2*2)[0];
        ((npy_int *)data_out + 2*2)[1] =
                                ((npy_int *)data0 + 2*2)[1] +
                                ((npy_int *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_int *)data_out + 2*3)[0] =
                                ((npy_int *)data0 + 2*3)[0] +
                                ((npy_int *)data_out + 2*3)[0];
        ((npy_int *)data_out + 2*3)[1] =
                                ((npy_int *)data0 + 2*3)[1] +
                                ((npy_int *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_int *)data_out + 2*4)[0] =
                                ((npy_int *)data0 + 2*4)[0] +
                                ((npy_int *)data_out + 2*4)[0];
        ((npy_int *)data_out + 2*4)[1] =
                                ((npy_int *)data0 + 2*4)[1] +
                                ((npy_int *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_int *)data_out + 2*5)[0] =
                                ((npy_int *)data0 + 2*5)[0] +
                                ((npy_int *)data_out + 2*5)[0];
        ((npy_int *)data_out + 2*5)[1] =
                                ((npy_int *)data0 + 2*5)[1] +
                                ((npy_int *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_int *)data_out + 2*6)[0] =
                                ((npy_int *)data0 + 2*6)[0] +
                                ((npy_int *)data_out + 2*6)[0];
        ((npy_int *)data_out + 2*6)[1] =
                                ((npy_int *)data0 + 2*6)[1] +
                                ((npy_int *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_int *)data_out + 2*7)[0] =
                                ((npy_int *)data0 + 2*7)[0] +
                                ((npy_int *)data_out + 2*7)[0];
        ((npy_int *)data_out + 2*7)[1] =
                                ((npy_int *)data0 + 2*7)[1] +
                                ((npy_int *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 3 == 2 && !0

/*
 * Multiply-accumulate a contiguous run into the output:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]      for i in [0, count)
 *
 * data     - input elements, walked with unit element stride
 * data_out - output elements, walked with unit element stride and updated
 *            in place
 * scalar   - factor applied to every input element
 * count    - number of elements to process
 *
 * The guard below is the literal `#if 0`, so for npy_int the NPYV branch is
 * preprocessed away and only the scalar code after the matching `#else` is
 * compiled.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
{
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s32;
    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
    /* Both pointers aligned: four vectors per iteration via loada/storea. */
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Otherwise: same loop shape using the plain load/store variants. */
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
            
#line 312
            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
            
#line 312
            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
            
#line 312
            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
            
            #line 318
            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
            
#line 318
            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
            
#line 318
            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
            
#line 318
            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
            
            #line 323
            npyv_store_s32(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_s32(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_s32(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder, one vector step at a time. The *_tillz / *_till calls are
     * handed the remaining element count.
     * NOTE(review): presumably they touch only that many lanes - confirm
     * against the NPYV headers.
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data, count);
        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration. All four inputs and all four current
     * outputs are read before any result is written back.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_int b0 = (data[0]);
        const npy_int c0 = (data_out[0]);
        
#line 340
        const npy_int b1 = (data[1]);
        const npy_int c1 = (data_out[1]);
        
#line 340
        const npy_int b2 = (data[2]);
        const npy_int c2 = (data_out[2]);
        
#line 340
        const npy_int b3 = (data[3]);
        const npy_int c3 = (data_out[3]);
        
        #line 346
        const npy_int abc0 = scalar * b0 + c0;
        
#line 346
        const npy_int abc1 = scalar * b1 + c1;
        
#line 346
        const npy_int abc2 = scalar * b2 + c2;
        
#line 346
        const npy_int abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * One element at a time: finishes whatever the loop above left over, or
     * handles the whole run when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_int b = (*data);
        const npy_int c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_int
}

/*
 * Two-operand kernel in which both inputs and the output are walked with
 * unit element stride:
 *
 *     out[i] = in0[i] * in1[i] + out[i]      for i in [0, count)
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output;
 *           each is reinterpreted as npy_int *
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements to process
 *
 * The NPYV branch is guarded by the literal `#if 0`, so only the scalar code
 * after the matching `#else` is compiled for npy_int.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *data_out = (npy_int *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_int
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s32;

    /* All three pointers aligned: four vectors per iteration via loada/storea. */
    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
            
#line 390
            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
            
#line 390
            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
            
#line 390
            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
            
            #line 397
            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
            
#line 397
            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
            
#line 397
            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
            
#line 397
            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
            
            #line 402
            npyv_storea_s32(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s32(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s32(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Otherwise: same loop shape using the plain load/store variants. */
#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
            
#line 390
            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
            
#line 390
            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
            
#line 390
            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
            
            #line 397
            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
            
#line 397
            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
            
#line 397
            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
            
#line 397
            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
            
            #line 402
            npyv_store_s32(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s32(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s32(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s32(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder, one vector step at a time, passing the remaining element
     * count to the *_tillz / *_till calls.
     * NOTE(review): presumably they touch only that many lanes - confirm
     * against the NPYV headers.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data0, count);
        npyv_s32 b = npyv_load_tillz_s32(data1, count);
        npyv_s32 c = npyv_load_tillz_s32(data_out, count);
        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration. Every input and every current output of
     * the group is read before any result is written back.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_int a0 = (data0[0]);
        const npy_int b0 = (data1[0]);
        const npy_int c0 = (data_out[0]);
        
#line 420
        const npy_int a1 = (data0[1]);
        const npy_int b1 = (data1[1]);
        const npy_int c1 = (data_out[1]);
        
#line 420
        const npy_int a2 = (data0[2]);
        const npy_int b2 = (data1[2]);
        const npy_int c2 = (data_out[2]);
        
#line 420
        const npy_int a3 = (data0[3]);
        const npy_int b3 = (data1[3]);
        const npy_int c3 = (data_out[3]);
        
        #line 427
        const npy_int abc0 = a0 * b0 + c0;
        
#line 427
        const npy_int abc1 = a1 * b1 + c1;
        
#line 427
        const npy_int abc2 = a2 * b2 + c2;
        
#line 427
        const npy_int abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * One element at a time: finishes whatever the loop above left over, or
     * handles the whole run when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_int a = (*data0);
        const npy_int b = (*data1);
        const npy_int c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_int

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel: operand 0 is read once as a single npy_int, while
 * operand 1 and the output are handed to int_sum_of_products_muladd as
 * element runs of length `count`.
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0] supplies the factor, dataptr[1] the input run,
 *           dataptr[2] the output run
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements, forwarded unchanged
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* The factor is fetched up front, before any output element is written. */
    const npy_int factor = *(npy_int *)dataptr[0];
    npy_int *const run_in = (npy_int *)dataptr[1];
    npy_int *const run_out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    int_sum_of_products_muladd(run_in, run_out, factor, count);
}

/*
 * Two-operand kernel: operand 1 is read once as a single npy_int, while
 * operand 0 and the output are handed to int_sum_of_products_muladd as
 * element runs of length `count`.
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[1] supplies the factor, dataptr[0] the input run,
 *           dataptr[2] the output run
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements, forwarded unchanged
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* The factor is fetched up front, before any output element is written. */
    const npy_int factor = *(npy_int *)dataptr[1];
    npy_int *const run_in = (npy_int *)dataptr[0];
    npy_int *const run_out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    int_sum_of_products_muladd(run_in, run_out, factor, count);
}

/*
 * Two-operand kernel that reduces into a single output element:
 *
 *     *out += sum over i in [0, count) of in0[i] * in1[i]
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0] and dataptr[1] are the inputs, walked with unit
 *           element stride; dataptr[2] points at the one npy_int that
 *           receives the sum
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of element pairs to process
 *
 * The products are summed into a local accumulator and the output is read
 * and written exactly once, at the very end. The NPYV branch is guarded by
 * the literal `#if 0`, so only the scalar code after the matching `#else`
 * is compiled for npy_int.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int accum = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_int
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s32;
    npyv_s32 vaccum = npyv_zero_s32();

    /* Both inputs aligned: four vector pairs per iteration via loada. */
    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
            
#line 501
            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
            
#line 501
            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
            
#line 501
            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, vaccum);
            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
                    vaccum = npyv_muladd_s32(a0, b0, ab1);
        }
    }
    
    /* Otherwise: same loop shape using the plain load variant. */
#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
            
#line 501
            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
            
#line 501
            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
            
#line 501
            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, vaccum);
            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
                    vaccum = npyv_muladd_s32(a0, b0, ab1);
        }
    }
    
    /*
     * Remainder, one vector step at a time, passing the remaining element
     * count to npyv_load_tillz_s32.
     * NOTE(review): presumably lanes past `count` load as zero so they do
     * not contribute to the sum - confirm against the NPYV headers.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s32 a = npyv_load_tillz_s32(data0, count);
        npyv_s32 b = npyv_load_tillz_s32(data1, count);
        vaccum = npyv_muladd_s32(a, b, vaccum);
    }
    /* Collapse the vector accumulator into the scalar one. */
    accum = npyv_sum_s32(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per iteration, added to the accumulator together. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_int ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_int ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_int ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_int ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * One product at a time: finishes whatever the loop above left over, or
     * handles the whole run when NPY_DISABLE_OPTIMIZATION is defined.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_int a = (*data0);
        const npy_int b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_int
    /* Single read-modify-write of the output element. */
    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + accum);
}

/*
 * Two-operand kernel that reduces into a single output element: operand 0 is
 * read once as a single npy_int, operand 1 is summed over `count` elements
 * by int_sum_of_arr, and the output element is increased by the product of
 * the two.
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0] supplies the factor, dataptr[1] the run to sum,
 *           dataptr[2] the output element
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements in the run
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Read the factor before the run is summed and before the output is touched. */
    const npy_int factor = *(npy_int *)dataptr[0];
    const npy_int run_total = int_sum_of_arr((npy_int *)dataptr[1], count);
    npy_int *const out = (npy_int *)dataptr[2];

    *out += factor * run_total;
}

/*
 * Two-operand kernel that reduces into a single output element: operand 1 is
 * read once as a single npy_int, operand 0 is summed over `count` elements
 * by int_sum_of_arr, and the output element is increased by the product of
 * the two.
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[1] supplies the factor, dataptr[0] the run to sum,
 *           dataptr[2] the output element
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements in the run
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Read the factor before the run is summed and before the output is touched. */
    const npy_int factor = *(npy_int *)dataptr[1];
    const npy_int run_total = int_sum_of_arr((npy_int *)dataptr[0], count);
    npy_int *const out = (npy_int *)dataptr[2];

    *out += factor * run_total;
}

#elif 3 == 3 && !0

/*
 * Three-operand kernel in which all inputs and the output are walked with
 * unit element stride:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0..2] are the inputs, dataptr[3] the output; each is
 *           reinterpreted as npy_int *
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements to process
 *
 * Elements are handled in groups of UNROLL_BY, followed by a tail of at most
 * UNROLL_BY elements. Within a group every element is read and written in
 * ascending index order.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    enum { UNROLL_BY = 8 };

    npy_int *in0 = (npy_int *)dataptr[0];
    npy_int *in1 = (npy_int *)dataptr[1];
    npy_int *in2 = (npy_int *)dataptr[2];
    npy_int *out = (npy_int *)dataptr[3];
    int k;

    /* Whole groups. */
    for (; count >= UNROLL_BY; count -= UNROLL_BY) {
        for (k = 0; k < UNROLL_BY; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += UNROLL_BY;
        in1 += UNROLL_BY;
        in2 += UNROLL_BY;
        out += UNROLL_BY;
    }

    /*
     * Tail: stop as soon as the remaining count reaches zero. The test is
     * an equality check that is consumed by a post-decrement, exactly as in
     * a chain of "if (count-- == 0) return;" guards.
     */
    for (k = 0; k < UNROLL_BY; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 3 > 3 || complex */

/*
 * Generic fallback form of the contiguous kernel: for each of `count`
 * elements, multiply the current element of operands 0 .. nop-1 together,
 * add the product to the current output element (operand index nop), then
 * step every pointer in dataptr[0..nop] forward by sizeof(npy_int) bytes.
 *
 * nop     - number of input operands
 * dataptr - nop input pointers followed by the output pointer; the entries
 *           are advanced in place
 * strides - unused (wrapped in NPY_UNUSED)
 * count   - number of elements to process
 *
 * The inner guard is the literal `#if !0`, so the real-valued path is the
 * one that is compiled; the complex path after `#else` is preprocessed away.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op;
        npy_int product = *(npy_int *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_int *)dataptr[op];
        }
        /*
         * The current output value is read through `op`, i.e. the index at
         * which the loop above stopped.
         */
        *(npy_int *)dataptr[nop] = product + *(npy_int *)dataptr[op];

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_int);
        }
#else /* complex */
#  if 3 <= 3
#    define _SUMPROD_NOP 3
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_int re, im, tmp;
        int i;
        re = ((npy_int *)dataptr[0])[0];
        im = ((npy_int *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_int *)dataptr[i])[0] -
                  im * ((npy_int *)dataptr[i])[1];
            im = re * ((npy_int *)dataptr[i])[1] +
                 im * ((npy_int *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_int);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* contig functions selected by operand count (here 3) */

#if 3 == 1

/*
 * One-operand kernel that reduces into a single output element: the `count`
 * elements at dataptr[0] are summed by int_sum_of_arr and the total is added
 * to the npy_int at dataptr[1].
 *
 * nop     - not referenced in this function
 * dataptr - dataptr[0] is the run to sum, dataptr[1] the output element
 * strides - not referenced in this function
 * count   - number of elements in the run
 *
 * The guard is the literal `#if !0`, so the real-valued path is the one that
 * is compiled; the complex path after `#else` is preprocessed away.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    /* Sum the run first, then fold it into the output element. */
    const npy_int run_total = int_sum_of_arr((npy_int *)dataptr[0], count);
    npy_int *const out = (npy_int *)dataptr[1];

    *out = run_total + *out;
#else
    npy_int accum_re = 0, accum_im = 0;
    npy_int *data0 = (npy_int *)dataptr[0];
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count > 4; count -= 4, data0 += 4*2) {
        const npy_int re01 = data0[0] + data0[2];
        const npy_int re23 = data0[4] + data0[6];
        const npy_int im13 = data0[1] + data0[3];
        const npy_int im57 = data0[5] + data0[7];
        accum_re += re01 + re23;
        accum_im += im13 + im57;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    for (; count > 0; --count, data0 += 2) {
        accum_re += data0[0];
        accum_im += data0[1];
    }
    ((npy_int *)dataptr[1])[0] += accum_re;
    ((npy_int *)dataptr[1])[1] += accum_im;
#endif // !0
}

#endif /* 3 == 1 */

/*
 * Three-operand kernel that reduces into a single output element, with each
 * input walked by its own byte stride:
 *
 *     *out += sum over `count` steps of in0 * in1 * in2
 *
 * nop     - referenced only inside branches that are preprocessed away in
 *           this instantiation
 * dataptr - dataptr[0..2] are the input byte pointers, dataptr[3] points at
 *           the npy_int that receives the sum
 * strides - strides[0..2] are added to the input pointers after each step
 * count   - number of steps
 *
 * All preprocessor conditions here are literals. With the values present
 * (`3` for the operand count, `0` for the complex flag) the compiled code
 * is: the scalar `accum`, the three data/stride pairs, the `3 == 3` arm of
 * the loop body, and the `dataptr[3]` store at the end.
 */
static void
int_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if 0
    npy_int accum_re = 0, accum_im = 0;
#else
    npy_int accum = 0;
#endif

    /* Local copies of the input pointers and strides used by the loop. */
#if (3 == 1) || (3 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (3 == 2 || 3 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (3 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 3 == 1
        accum += (*(npy_int *)data0);
        data0 += stride0;
#  elif 3 == 2
        accum += (*(npy_int *)data0) *
                 (*(npy_int *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 3 == 3
        /* Compiled arm: one triple product per step, then advance by bytes. */
        accum += (*(npy_int *)data0) *
                 (*(npy_int *)data1) *
                 (*(npy_int *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        npy_int temp = (*(npy_int *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_int *)dataptr[i]);
        }
        accum += temp;
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 3 == 1
        accum_re += ((npy_int *)data0)[0];
        accum_im += ((npy_int *)data0)[1];
        data0 += stride0;
#  else
#    if 3 <= 3
#define _SUMPROD_NOP 3
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_int re, im, tmp;
        int i;
        re = ((npy_int *)dataptr[0])[0];
        im = ((npy_int *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_int *)dataptr[i])[0] -
                  im * ((npy_int *)dataptr[i])[1];
            im = re * ((npy_int *)dataptr[i])[1] +
                 im * ((npy_int *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulated sum into the output element exactly once. */
#if 0
#  if 3 <= 3
    ((npy_int *)dataptr[3])[0] += accum_re;
    ((npy_int *)dataptr[3])[1] += accum_im;
#  else
    ((npy_int *)dataptr[nop])[0] += accum_re;
    ((npy_int *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 3 <= 3
    *((npy_int *)dataptr[3]) = (accum +
                                    (*((npy_int *)dataptr[3])));
#  else
    *((npy_int *)dataptr[nop]) = (accum +
                                    (*((npy_int *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * Kernel for an arbitrary number of operands, each walked by its own byte
 * stride. For each of `count` steps it multiplies the current element of
 * operands 0 .. nop-1 together, adds the product to the current output
 * element (operand index nop), and then advances every pointer in
 * dataptr[0..nop] by the matching entry of `strides`.
 *
 * nop     - number of input operands
 * dataptr - nop input byte pointers followed by the output byte pointer;
 *           the entries are advanced in place
 * strides - byte strides, one per entry of dataptr
 * count   - number of steps
 *
 * All preprocessor conditions here are literals. With the values present
 * (`1000` for the operand count, `0` for the complex flag) none of the
 * data0/data1/data2/data_out locals is declared and the compiled loop body
 * is the final `#  else` arm of the `#if !0` section.
 */
static void
int_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (1000 == 1) || (1000 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1000 == 2 || 1000 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1000 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
#if (1000 == 1) || (1000 <= 3 && !0)
    char *data_out = dataptr[1000];
    npy_intp stride_out = strides[1000];
#endif

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_any (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 1000 == 1
        *(npy_int *)data_out = ((*(npy_int *)data0) +
                                         (*(npy_int *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 1000 == 2
        *(npy_int *)data_out = ((*(npy_int *)data0) *
                                         (*(npy_int *)data1) +
                                         (*(npy_int *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 1000 == 3
        *(npy_int *)data_out = ((*(npy_int *)data0) *
                                         (*(npy_int *)data1) *
                                         (*(npy_int *)data2) +
                                         (*(npy_int *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        /* Compiled arm: running product over the nop inputs. */
        npy_int temp = (*(npy_int *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_int *)dataptr[i]);
        }
        /*
         * The current output value is read through `i`, the index at which
         * the loop above stopped; that equals nop whenever nop >= 1.
         */
        *(npy_int *)dataptr[nop] = (temp +
                                           (*(npy_int *)dataptr[i]));
        /* Advance the inputs and the output (note the <=). */
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1000 == 1
        ((npy_int *)data_out)[0] = ((npy_int *)data0)[0] +
                                         ((npy_int *)data_out)[0];
        ((npy_int *)data_out)[1] = ((npy_int *)data0)[1] +
                                         ((npy_int *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 1000 <= 3
#define _SUMPROD_NOP 1000
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_int re, im, tmp;
        int i;
        re = ((npy_int *)dataptr[0])[0];
        im = ((npy_int *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_int *)dataptr[i])[0] -
                  im * ((npy_int *)dataptr[i])[1];
            im = re * ((npy_int *)dataptr[i])[1] +
                 im * ((npy_int *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 1000 == 1

/*
 * Single-operand kernel for npy_int where input and output are both
 * addressed as plain npy_int arrays:
 *
 *     data_out[i] = data0[i] + data_out[i]     for i in [0, count)
 *
 *   dataptr[0] - input array, dataptr[1] - output array.
 *   strides    - unused.
 *   count      - number of elements; a count <= 0 performs no work.
 *
 * Whole groups of eight are processed front to back; the remaining
 * (at most seven) elements are processed from the highest index down.
 */
static void
int_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data_out = (npy_int *)dataptr[1];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Main loop: eight elements per iteration */
    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = data0[i] + data_out[i];
        }
    }

    /* Finish off the loop: indices count-1 down to 0 */
    for (i = count; i-- > 0; ) {
        data_out[i] = data0[i] + data_out[i];
    }
}

#elif 1000 == 2 && !0

/*
 * Multiply-add with a scalar over two npy_int arrays:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]     for i in [0, count)
 *
 * Only a scalar implementation exists for npy_int. Unless
 * NPY_DISABLE_OPTIMIZATION is defined, elements are handled in groups of
 * four, with all four results computed before any of them is stored;
 * whatever is left over is handled one element at a time.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read and combine the whole group first ... */
        const npy_int r0 = scalar * data[0] + data_out[0];
        const npy_int r1 = scalar * data[1] + data_out[1];
        const npy_int r2 = scalar * data[2] + data_out[2];
        const npy_int r3 = scalar * data[3] + data_out[3];

        /* ... then write it back */
        data_out[0] = r0;
        data_out[1] = r1;
        data_out[2] = r2;
        data_out[3] = r3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *data_out = scalar * (*data) + (*data_out);
        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand kernel for npy_int where both inputs and the output are
 * addressed as plain npy_int arrays:
 *
 *     data_out[i] = data0[i] * data1[i] + data_out[i]   for i in [0, count)
 *
 *   dataptr[0], dataptr[1] - input arrays, dataptr[2] - output array.
 *   strides                - unused.
 *   count                  - number of elements.
 *
 * Only a scalar implementation exists for npy_int. Unless
 * NPY_DISABLE_OPTIMIZATION is defined, elements are handled in groups of
 * four, with all four results computed before any of them is stored.
 */
static void
int_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *data_out = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read and combine the whole group first ... */
        const npy_int r0 = data0[0] * data1[0] + data_out[0];
        const npy_int r1 = data0[1] * data1[1] + data_out[1];
        const npy_int r2 = data0[2] * data1[2] + data_out[2];
        const npy_int r3 = data0[3] * data1[3] + data_out[3];

        /* ... then write it back */
        data_out[0] = r0;
        data_out[1] = r1;
        data_out[2] = r2;
        data_out[3] = r3;

        count -= 4;
        data0 += 4;
        data1 += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *data_out = (*data0) * (*data1) + (*data_out);
        --count;
        ++data0;
        ++data1;
        ++data_out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand specialization: operand 0 is a single value (read once from
 * dataptr[0]), operand 1 and the output are npy_int arrays.
 * Delegates to int_sum_of_products_muladd with operand 0 as the scalar.
 */
static void
int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int factor = *(npy_int *)dataptr[0];
    npy_int *src = (npy_int *)dataptr[1];
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    int_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand specialization: operand 1 is a single value (read once from
 * dataptr[1]), operand 0 and the output are npy_int arrays.
 * Delegates to int_sum_of_products_muladd with operand 1 as the scalar.
 */
static void
int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_int factor = *(npy_int *)dataptr[1];
    npy_int *src = (npy_int *)dataptr[0];
    npy_int *dst = (npy_int *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    int_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand specialization that reduces into a single output element:
 *
 *     *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]
 *
 *   dataptr[0], dataptr[1] - npy_int input arrays.
 *   dataptr[2]             - the one npy_int output element.
 *   strides                - unused.
 *
 * Only a scalar implementation exists for npy_int. Unless
 * NPY_DISABLE_OPTIMIZATION is defined, four products are accumulated per
 * iteration before the one-at-a-time remainder loop runs.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *out = (npy_int *)dataptr[2];
    npy_int accum = 0;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        accum += data0[0] * data1[0] + data0[1] * data1[1] +
                 data0[2] * data1[2] + data0[3] * data1[3];
        count -= 4;
        data0 += 4;
        data1 += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        accum += (*data0) * (*data1);
        --count;
        ++data0;
        ++data1;
    }

    /* The output is touched exactly once, after the whole reduction */
    *out = *out + accum;
}

/*
 * Two-operand specialization: operand 0 is a single value, operand 1 is an
 * npy_int array, and the output is a single element.
 *
 * Adds value0 * int_sum_of_arr(data1, count) to *dataptr[2].
 * int_sum_of_arr is defined earlier in this file (presumably the sum of the
 * `count` array elements - confirm against its definition).
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *out = (npy_int *)dataptr[2];
    const npy_int factor = *(npy_int *)dataptr[0];
    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);

    *out = *out + factor * total;
}

/*
 * Two-operand specialization: operand 0 is an npy_int array, operand 1 is a
 * single value, and the output is a single element.
 *
 * Adds value1 * int_sum_of_arr(data0, count) to *dataptr[2].
 * int_sum_of_arr is defined earlier in this file (presumably the sum of the
 * `count` array elements - confirm against its definition).
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *out = (npy_int *)dataptr[2];
    const npy_int factor = *(npy_int *)dataptr[1];
    const npy_int total = int_sum_of_arr((npy_int *)dataptr[0], count);

    *out = *out + factor * total;
}

#elif 1000 == 3 && !0

/*
 * Three-operand kernel for npy_int where all inputs and the output are
 * addressed as plain npy_int arrays:
 *
 *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
 *
 *   dataptr[0..2] - input arrays, dataptr[3] - output array.
 *   strides       - unused.
 *   count         - number of elements.
 */
static void
int_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_int *data0 = (npy_int *)dataptr[0];
    npy_int *data1 = (npy_int *)dataptr[1];
    npy_int *data2 = (npy_int *)dataptr[2];
    npy_int *data_out = (npy_int *)dataptr[3];
    int k;

    /* Process groups of eight elements */
    for (; count >= 8;
            count -= 8, data0 += 8, data1 += 8, data2 += 8, data_out += 8) {
        for (k = 0; k < 8; ++k) {
            data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
        }
    }

    /*
     * Finish off the loop: handle the leftover elements in ascending order,
     * returning as soon as the remaining count reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Sum-of-products kernel for npy_int with an arbitrary number of operands,
 * all of which advance by exactly one npy_int per iteration.
 *
 * On each of the `count` iterations the current elements of the inputs
 * dataptr[0] .. dataptr[nop-1] are multiplied together and the product is
 * added to the current element of the output dataptr[nop]; then every one
 * of the nop + 1 pointers is advanced by sizeof(npy_int) bytes.
 *
 *   nop     - number of input operands; the output is entry `nop`.
 *   dataptr - nop + 1 element pointers. The array itself is updated in
 *             place as the pointers advance.
 *   strides - unused.
 *   count   - number of elements to process.
 */
static void
int_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    while (count--) {
        npy_int *out = (npy_int *)dataptr[nop];
        npy_int prod = *(npy_int *)dataptr[0];
        int iop;

        /* Fold the remaining inputs into the running product */
        for (iop = 1; iop < nop; ++iop) {
            prod *= *(npy_int *)dataptr[iop];
        }
        *out = prod + *out;

        /* Step every operand, output included, to its next element */
        for (iop = 0; iop <= nop; ++iop) {
            dataptr[iop] += sizeof(npy_int);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * Single-operand specialization that reduces an npy_int array into one
 * output element:
 *
 *     *dataptr[1] = int_sum_of_arr(dataptr[0], count) + *dataptr[1]
 *
 * int_sum_of_arr is defined earlier in this file (presumably the sum of the
 * `count` array elements - confirm against its definition).
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_int *out = (npy_int *)dataptr[1];
    npy_int total;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    total = int_sum_of_arr((npy_int *)dataptr[0], count);
    *out = total + *out;
}

#endif /* 1000 == 1 */

/*
 * Generic reduction of any number of npy_int operands into an output whose
 * stride is zero.
 *
 * On each of the `count` iterations the current elements of the `nop`
 * inputs (dataptr[0] .. dataptr[nop-1]) are multiplied together and the
 * product is added to a running total; every input pointer is then
 * advanced by its entry in `strides`.  Once the loop is done the total is
 * added to the value at dataptr[nop], which is never advanced.
 *
 * The input pointers held in `dataptr` are modified in place.  The
 * template's fixed-operand-count and complex branches are compiled out for
 * this instantiation, so only the generic real path is spelled out here.
 */
static void
int_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_int total = 0;
    npy_int *result;
    int op;

    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        /* Product of the current element of every input operand */
        npy_int product = *(npy_int *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_int *)dataptr[op];
        }
        total += product;

        /* Step the inputs only; the output slot stays where it is */
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
    }

    result = (npy_int *)dataptr[nop];
    *result = total + *result;
}




#line 74

#if !0
/*
 * Returns the sum of the `count` contiguous npy_long values starting at
 * `data`; a non-positive `count` yields 0.
 *
 * The universal-intrinsics (NPYV) check is false for npy_long in this
 * instantiation, so only the scalar implementation is spelled out here.
 */
static NPY_GCC_OPT_3 npy_long long_sum_of_arr(npy_long *data, npy_intp count)
{
    npy_long total = 0;

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Groups of four, added pairwise, while more than four elements remain */
    while (count > 4) {
        const npy_long lower = data[0] + data[1];
        const npy_long upper = data[2] + data[3];

        total += lower + upper;
        data += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Whatever is left is added one element at a time */
    while (count > 0) {
        total += *data;
        ++data;
        --count;
    }

    return total;
}
#endif

#line 131
/*
 * Strided accumulation for a single npy_long operand: on each of the
 * `count` iterations the current input element (dataptr[0], step
 * strides[0]) is added into the current output element (dataptr[1], step
 * strides[1]).
 *
 * Strides are applied to `char *` pointers, i.e. they are byte offsets.
 * The pointers stored in `dataptr` are not modified; `nop` is not
 * consulted.  The template's multi-operand and complex branches are
 * compiled out for this instantiation and are not spelled out here.
 */
static void
long_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    const npy_intp src_step = strides[0];
    const npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count, src += src_step, dst += dst_step) {
        npy_long *out = (npy_long *)dst;

        *out = *(npy_long *)src + *out;
    }
}

#if 1 == 1

/*
 * Contiguous accumulation for a single npy_long operand:
 * out[i] += in[i] for i in [0, count), with `in` at dataptr[0] and `out`
 * at dataptr[1].
 *
 * Full blocks of eight are processed in ascending index order; a final
 * remainder of one to seven elements is processed from its highest index
 * down to index 0.  The remainder switch is evaluated before the block
 * loop so that small counts return without entering it.
 *
 * NOTE: a negative `count` matches neither the switch nor the block loop,
 * so callers are expected to pass count >= 0.  `nop` is not consulted.
 * The template's complex branches are compiled out for this type and are
 * not spelled out here.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Fewer than eight elements left: finish them and return */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fallthrough */
            case 6:
                dst[5] += src[5];
                /* fallthrough */
            case 5:
                dst[4] += src[4];
                /* fallthrough */
            case 4:
                dst[3] += src[3];
                /* fallthrough */
            case 3:
                dst[2] += src[2];
                /* fallthrough */
            case 2:
                dst[1] += src[1];
                /* fallthrough */
            case 1:
                dst[0] += src[0];
                /* fallthrough */
            case 0:
                return;
        }

        /* Main loop, unrolled by 8 */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish off the remainder */
    }
}

#elif 1 == 2 && !0

/*
 * Element-wise multiply-accumulate over contiguous npy_long arrays:
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 * The universal-intrinsics (NPYV) check is false for npy_long in this
 * instantiation, so only the scalar implementation is spelled out here.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All eight values are loaded before anything is written back */
        const npy_long in0 = data[0];
        const npy_long in1 = data[1];
        const npy_long in2 = data[2];
        const npy_long in3 = data[3];
        const npy_long acc0 = data_out[0];
        const npy_long acc1 = data_out[1];
        const npy_long acc2 = data_out[2];
        const npy_long acc3 = data_out[3];

        data_out[0] = scalar * in0 + acc0;
        data_out[1] = scalar * in1 + acc1;
        data_out[2] = scalar * in2 + acc2;
        data_out[3] = scalar * in3 + acc3;

        data += 4;
        data_out += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one at a time */
    while (count > 0) {
        const npy_long in = *data;
        const npy_long acc = *data_out;

        *data_out = scalar * in + acc;
        ++data;
        ++data_out;
        --count;
    }
}

/*
 * Contiguous two-operand sum of products for npy_long:
 *     out[i] = lhs[i] * rhs[i] + out[i]    for i in [0, count)
 * with lhs at dataptr[0], rhs at dataptr[1] and out at dataptr[2].
 *
 * `nop` is not consulted.  The universal-intrinsics (NPYV) check is false
 * for npy_long in this instantiation, so only the scalar implementation is
 * spelled out here.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All twelve values are loaded before anything is written back */
        const npy_long l0 = lhs[0];
        const npy_long l1 = lhs[1];
        const npy_long l2 = lhs[2];
        const npy_long l3 = lhs[3];
        const npy_long r0 = rhs[0];
        const npy_long r1 = rhs[1];
        const npy_long r2 = rhs[2];
        const npy_long r3 = rhs[3];
        const npy_long acc0 = out[0];
        const npy_long acc1 = out[1];
        const npy_long acc2 = out[2];
        const npy_long acc3 = out[3];

        out[0] = l0 * r0 + acc0;
        out[1] = l1 * r1 + acc1;
        out[2] = l2 * r2 + acc2;
        out[3] = l3 * r3 + acc3;

        lhs += 4;
        rhs += 4;
        out += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one at a time */
    while (count > 0) {
        const npy_long l = *lhs;
        const npy_long r = *rhs;
        const npy_long acc = *out;

        *out = l * r + acc;
        ++lhs;
        ++rhs;
        ++out;
        --count;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case where the first operand is a single value (stride 0)
 * and both the second operand and the output are contiguous:
 *     out[i] = value0 * data1[i] + out[i]    for i in [0, count)
 *
 * The work is delegated to long_sum_of_products_muladd.  `nop` is not
 * consulted.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* The scalar is read once, before any output element is written */
    const npy_long factor = *(npy_long *)dataptr[0];
    npy_long *src = (npy_long *)dataptr[1];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand case where the second operand is a single value (stride 0)
 * and both the first operand and the output are contiguous:
 *     out[i] = value1 * data0[i] + out[i]    for i in [0, count)
 *
 * The work is delegated to long_sum_of_products_muladd.  `nop` is not
 * consulted.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* The scalar is read once, before any output element is written */
    const npy_long factor = *(npy_long *)dataptr[1];
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Dot-product style reduction of two contiguous npy_long operands into an
 * output whose stride is zero: the sum over i in [0, count) of
 * lhs[i] * rhs[i] is added to the single value at dataptr[2].
 *
 * `nop` is not consulted.  The universal-intrinsics (NPYV) check is false
 * for npy_long in this instantiation, so only the scalar implementation is
 * spelled out here.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *result;
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per iteration while at least four elements remain */
    while (count >= 4) {
        const npy_long p0 = lhs[0] * rhs[0];
        const npy_long p1 = lhs[1] * rhs[1];
        const npy_long p2 = lhs[2] * rhs[2];
        const npy_long p3 = lhs[3] * rhs[3];

        total += p0 + p1 + p2 + p3;
        lhs += 4;
        rhs += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one at a time */
    while (count > 0) {
        total += (*lhs) * (*rhs);
        ++lhs;
        ++rhs;
        --count;
    }

    result = (npy_long *)dataptr[2];
    *result = *result + total;
}

/*
 * Two-operand kernel with a single output slot.
 *
 * Reads one npy_long from dataptr[0], obtains the value returned by
 * long_sum_of_arr() for the `count` elements starting at dataptr[1], and
 * adds the product of the two to the npy_long stored at dataptr[2].
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = *(npy_long *)dataptr[0];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[1], count);
    npy_long *out = (npy_long *)dataptr[2];

    *out = *out + scalar * total;
}

/*
 * Two-operand kernel with a single output slot.
 *
 * Reads one npy_long from dataptr[1], obtains the value returned by
 * long_sum_of_arr() for the `count` elements starting at dataptr[0], and
 * adds the product of the two to the npy_long stored at dataptr[2].
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = *(npy_long *)dataptr[1];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
    npy_long *out = (npy_long *)dataptr[2];

    *out = *out + scalar * total;
}

#elif 1 == 3 && !0

/*
 * Three-operand kernel over densely indexed npy_long arrays.
 *
 * For each processed index i:
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 * where in0/in1/in2 are dataptr[0..2] and out is dataptr[3].  Elements are
 * handled in ascending index order, first in groups of eight and then one
 * at a time for the remainder.  `nop` and `strides` are not consulted and
 * the entries of `dataptr` themselves are left unchanged.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Whole groups of eight elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Remainder: at most eight further elements, stopping as soon as the
     * post-decremented counter reports that none are left.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic sum-of-products kernel for npy_long with an arbitrary operand
 * count.
 *
 * For each of `count` iterations this multiplies together the npy_long
 * values currently pointed to by dataptr[0] .. dataptr[nop-1], adds the
 * product to the npy_long at dataptr[nop], and then advances every one of
 * the nop+1 pointers in `dataptr` by sizeof(npy_long).  The caller's
 * `dataptr` array is therefore modified in place.  `strides` is not
 * consulted.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    while (count--) {
        npy_long product = (*(npy_long *)dataptr[0]);
        int i;

        for (i = 1; i < nop; ++i) {
            product *= (*(npy_long *)dataptr[i]);
        }

        /*
         * Read and write the output through the same index (nop) rather
         * than relying on the value the loop counter happens to hold
         * after the loop above.
         */
        *(npy_long *)dataptr[nop] = (product +
                                           (*(npy_long *)dataptr[nop]));

        /* Step all inputs and the output to the next element */
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_long);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Single-operand kernel with a single output slot.
 *
 * Adds the value returned by long_sum_of_arr() for the `count` elements
 * starting at dataptr[0] to the npy_long stored at dataptr[1].
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    npy_long *data = (npy_long *)dataptr[0];
    npy_long accum = long_sum_of_arr(data, count);
    *((npy_long *)dataptr[1]) = (accum + (*((npy_long *)dataptr[1])));
}

#endif /* 1 == 1 */

/*
 * Single-operand strided kernel with a single output slot.
 *
 * Walks `count` npy_long values starting at dataptr[0], stepping by
 * strides[0] bytes between reads, sums them, and adds the total to the
 * npy_long stored at dataptr[1].  The entries of `dataptr` are not
 * modified; `nop` is not consulted.
 */
static void
long_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_long accum = 0;
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    while (count--) {
        accum += (*(npy_long *)data0);
        /* data0 is a char pointer, so the stride is applied in bytes */
        data0 += stride0;
    }

    *((npy_long *)dataptr[1]) = (accum +
                                    (*((npy_long *)dataptr[1])));
}


#line 131
/*
 * Two-operand strided kernel.
 *
 * For each of `count` iterations:
 *     *out = (*in0) * (*in1) + (*out)
 * where in0, in1 and out start at dataptr[0], dataptr[1] and dataptr[2]
 * and are advanced by strides[0], strides[1] and strides[2] bytes
 * respectively after every iteration.  The entries of `dataptr` are not
 * modified; `nop` is not consulted.
 */
static void
long_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
    char *data_out = dataptr[2];
    npy_intp stride_out = strides[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_two (%d)\n", (int)count);

    while (count--) {
        *(npy_long *)data_out = ((*(npy_long *)data0) *
                                         (*(npy_long *)data1) +
                                         (*(npy_long *)data_out));
        /* All cursors are char pointers, so strides are applied in bytes */
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
    }
}

#if 2 == 1

/*
 * Single-operand kernel over densely indexed npy_long arrays.
 *
 * For each processed index i:
 *     out[i] = in[i] + out[i]
 * where in is dataptr[0] and out is dataptr[1].  Full groups of eight are
 * handled first in ascending index order; the remaining (count % 8)
 * elements are then handled from the highest index down to index 0.
 * A count that is zero or negative processes nothing.  `nop` and
 * `strides` are not consulted and the entries of `dataptr` are not
 * modified.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data_out = (npy_long *)dataptr[1];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Whole groups of eight elements */
    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = ((data0[i]) +
                                 (data_out[i]));
        }
    }

    /*
     * Remainder (fewer than eight elements), walked from the last index
     * down to the first.  The loop simply does not run when nothing is
     * left, so there is no jump back to re-test the count.
     */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = ((data0[i]) +
                             (data_out[i]));
    }
}

#elif 2 == 2 && !0

/*
 * Scaled accumulate over npy_long arrays:
 *     data_out[i] = scalar * data[i] + data_out[i]   for i in [0, count)
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled
 * four at a time; within each group all eight loads happen before any
 * store.  Whatever is left is handled one element at a time.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        /* Load the four inputs and the four current outputs */
        const npy_long b0 = (data[0]);
        const npy_long c0 = (data_out[0]);
        const npy_long b1 = (data[1]);
        const npy_long c1 = (data_out[1]);
        const npy_long b2 = (data[2]);
        const npy_long c2 = (data_out[2]);
        const npy_long b3 = (data[3]);
        const npy_long c3 = (data_out[3]);

        /* Multiply-add */
        const npy_long abc0 = scalar * b0 + c0;
        const npy_long abc1 = scalar * b1 + c1;
        const npy_long abc2 = scalar * b2 + c2;
        const npy_long abc3 = scalar * b3 + c3;

        /* Store the four results */
        data_out[0] = (abc0);
        data_out[1] = (abc1);
        data_out[2] = (abc2);
        data_out[3] = (abc3);
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_long b = (*data);
        const npy_long c = (*data_out);
        *data_out = (scalar * b + c);
    }
}

/*
 * Two-operand kernel over densely indexed npy_long arrays:
 *     out[i] = in0[i] * in1[i] + out[i]   for i in [0, count)
 * where in0, in1 and out are dataptr[0], dataptr[1] and dataptr[2].
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled
 * four at a time; within each group all twelve loads happen before any
 * store.  Whatever is left is handled one element at a time.  `nop` and
 * `strides` are not consulted and the entries of `dataptr` are not
 * modified.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long *data_out = (npy_long *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        /* Load both inputs and the current outputs for four elements */
        const npy_long a0 = (data0[0]);
        const npy_long b0 = (data1[0]);
        const npy_long c0 = (data_out[0]);
        const npy_long a1 = (data0[1]);
        const npy_long b1 = (data1[1]);
        const npy_long c1 = (data_out[1]);
        const npy_long a2 = (data0[2]);
        const npy_long b2 = (data1[2]);
        const npy_long c2 = (data_out[2]);
        const npy_long a3 = (data0[3]);
        const npy_long b3 = (data1[3]);
        const npy_long c3 = (data_out[3]);

        /* Multiply-add */
        const npy_long abc0 = a0 * b0 + c0;
        const npy_long abc1 = a1 * b1 + c1;
        const npy_long abc2 = a2 * b2 + c2;
        const npy_long abc3 = a3 * b3 + c3;

        /* Store the four results */
        data_out[0] = (abc0);
        data_out[1] = (abc1);
        data_out[2] = (abc2);
        data_out[3] = (abc3);
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_long a = (*data0);
        const npy_long b = (*data1);
        const npy_long c = (*data_out);
        *data_out = (a * b + c);
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand specialization: a single npy_long is read from dataptr[0]
 * and handed, together with the arrays at dataptr[1] (input) and
 * dataptr[2] (output), to long_sum_of_products_muladd().
 * nop and strides are not used.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Only one element of operand 0 is ever read. */
    const npy_long scalar = *(npy_long *)dataptr[0];
    npy_long *const vec = (npy_long *)dataptr[1];
    npy_long *const out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    long_sum_of_products_muladd(vec, out, scalar, count);
}

/*
 * Two-operand specialization: a single npy_long is read from dataptr[1]
 * and handed, together with the arrays at dataptr[0] (input) and
 * dataptr[2] (output), to long_sum_of_products_muladd().
 * nop and strides are not used.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Only one element of operand 1 is ever read. */
    const npy_long scalar = *(npy_long *)dataptr[1];
    npy_long *const vec = (npy_long *)dataptr[0];
    npy_long *const out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    long_sum_of_products_muladd(vec, out, scalar, count);
}

/*
 * Two-operand sum of products for npy_long that reduces into one output
 * value: the sum over k in [0, count) of data0[k] * data1[k] is added to
 * the npy_long stored at dataptr[2].
 *
 * dataptr[0] and dataptr[1] are read as densely packed npy_long arrays.
 * nop and strides are not used.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
    /*
     * The condition below is the literal 0, so this vector (npyv_*) branch
     * is never compiled for npy_long; the scalar code after #else is what
     * actually runs.
     */
#if 0 // NPYV check for npy_long
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_long;
    npyv_long vaccum = npyv_zero_long();

    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
            
#line 501
            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
            
#line 501
            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
            
#line 501
            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_long ab3 = npyv_muladd_long(a3, b3, vaccum);
            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
                    vaccum = npyv_muladd_long(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        /* Same loop as above, using the non-aligned load variant. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
            
#line 501
            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
            
#line 501
            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
            
#line 501
            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
            
            npyv_long ab3 = npyv_muladd_long(a3, b3, vaccum);
            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
                    vaccum = npyv_muladd_long(a0, b0, ab1);
        }
    }
    
    /* Remaining elements, one vector step at a time via the "tillz" loads. */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_long a = npyv_load_tillz_long(data0, count);
        npyv_long b = npyv_load_tillz_long(data1, count);
        vaccum = npyv_muladd_long(a, b, vaccum);
    }
    accum = npyv_sum_long(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, manually unrolled: four products per iteration. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_long ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_long ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_long ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_long ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Element-by-element loop: handles the leftover (< 4) elements, or every
     * element when NPY_DISABLE_OPTIMIZATION removes the unrolled loop above.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_long a = (*data0);
        const npy_long b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_long
    /* Fold the local total into the existing output value. */
    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction into a single output value: reads one npy_long
 * from dataptr[0], sums the count-element array at dataptr[1] with
 * long_sum_of_arr(), and adds (value * sum) to the npy_long at dataptr[2].
 * nop and strides are not used.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = *(npy_long *)dataptr[0];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[1], count);
    npy_long *const out = (npy_long *)dataptr[2];

    /* The multiplication is applied once, to the already summed array. */
    *out = *out + scalar * total;
}

/*
 * Two-operand reduction into a single output value: reads one npy_long
 * from dataptr[1], sums the count-element array at dataptr[0] with
 * long_sum_of_arr(), and adds (value * sum) to the npy_long at dataptr[2].
 * nop and strides are not used.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long scalar = *(npy_long *)dataptr[1];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
    npy_long *const out = (npy_long *)dataptr[2];

    /* The multiplication is applied once, to the already summed array. */
    *out = *out + scalar * total;
}

#elif 2 == 3 && !0

/*
 * Three-operand sum of products over npy_long arrays that are all indexed
 * element by element:
 *
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 *
 * with in0..in2 taken from dataptr[0..2] and out from dataptr[3].
 * nop and strides are not used.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
/* One multiply-accumulate step at element offset k. */
#define LONG_SOP3_STEP(k) \
    (out[(k)] = ((in0[(k)]) * (in1[(k)]) * (in2[(k)]) + (out[(k)])))

    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Main loop: eight explicit steps per iteration. */
    for (; count >= 8;
            count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
        LONG_SOP3_STEP(0);
        LONG_SOP3_STEP(1);
        LONG_SOP3_STEP(2);
        LONG_SOP3_STEP(3);
        LONG_SOP3_STEP(4);
        LONG_SOP3_STEP(5);
        LONG_SOP3_STEP(6);
        LONG_SOP3_STEP(7);
    }

    /*
     * Remainder: at most eight further steps, returning as soon as the
     * remaining count is found to be zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        LONG_SOP3_STEP(k);
    }

#undef LONG_SOP3_STEP
}

#else /* 2 > 3 || @complex */

/*
 * Generic sum-of-products loop that works directly on the dataptr array.
 *
 * Each of the count iterations multiplies together the current elements of
 * the input operands, adds the product to the current output element, and
 * then advances every pointer in dataptr by sizeof(npy_long).
 * The strides argument is not used.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        int op;
        npy_long product = (*(npy_long *)dataptr[0]);

        for (op = 1; op < nop; ++op) {
            product *= (*(npy_long *)dataptr[op]);
        }
        /* op is left at the loop's exit value and indexes the addend. */
        *(npy_long *)dataptr[nop] = (product +
                                           (*(npy_long *)dataptr[op]));

        /* Step the inputs and the output (index nop) forward one element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_long);
        }
#else /* complex */
#  if 2 <= 3
#    define _SUMPROD_NOP 2
#  else
#    define _SUMPROD_NOP nop
#  endif
        int op;
        npy_long prod_re, prod_im, next_re;

        prod_re = ((npy_long *)dataptr[0])[0];
        prod_im = ((npy_long *)dataptr[0])[1];
        for (op = 1; op < _SUMPROD_NOP; ++op) {
            /* (re + i*im) * (x + i*y), real part kept aside until im is done */
            next_re = prod_re * ((npy_long *)dataptr[op])[0] -
                      prod_im * ((npy_long *)dataptr[op])[1];
            prod_im = prod_re * ((npy_long *)dataptr[op])[1] +
                      prod_im * ((npy_long *)dataptr[op])[0];
            prod_re = next_re;
        }
        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = prod_re +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = prod_im +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];

        for (op = 0; op <= _SUMPROD_NOP; ++op) {
            dataptr[op] += sizeof(npy_long);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand reduction: adds the sum of the count elements at dataptr[0]
 * to the value stored at dataptr[1].
 *
 * The real-valued branch delegates the summation to long_sum_of_arr(); the
 * alternative branch treats the input as interleaved (re, im) pairs and
 * keeps separate real and imaginary totals.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
    npy_long *const out = (npy_long *)dataptr[1];

    *out = total + *out;
#else
    npy_long *pairs = (npy_long *)dataptr[0];
    npy_long sum_re = 0, sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per iteration; even slots are re, odd slots im. */
    for (; count > 4; count -= 4, pairs += 4*2) {
        const npy_long re_lo = pairs[0] + pairs[2];
        const npy_long re_hi = pairs[4] + pairs[6];
        const npy_long im_lo = pairs[1] + pairs[3];
        const npy_long im_hi = pairs[5] + pairs[7];
        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left, one pair at a time. */
    for (; count > 0; --count, pairs += 2) {
        sum_re += pairs[0];
        sum_im += pairs[1];
    }
    ((npy_long *)dataptr[1])[0] += sum_re;
    ((npy_long *)dataptr[1])[1] += sum_im;
#endif // !0
}

#endif /* 2 == 1 */

/*
 * Two-operand sum of products for npy_long with arbitrary input strides,
 * reduced into a single output value.
 *
 * The preprocessor conditions in this function are all constant
 * expressions.  With the values that appear here, the code that is compiled
 * walks dataptr[0] and dataptr[1] by strides[0] and strides[1] (applied to
 * char pointers, i.e. in bytes), accumulates the product of the two current
 * elements for count iterations, and finally adds that total to the
 * npy_long at dataptr[2].  The branches using accum_re / accum_im and the
 * branches for other operand counts are compiled out.  nop is not read by
 * the compiled code.
 */
static void
long_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* "#if 0" selects the single real accumulator in the #else branch. */
#if 0
    npy_long accum_re = 0, accum_im = 0;
#else
    npy_long accum = 0;
#endif

    /* True here: local cursor and stride for operand 0. */
#if (2 == 1) || (2 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
    /* True here: local cursor and stride for operand 1. */
#if (2 == 2 || 2 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
    /* False here: no third operand. */
#if (2 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 2 == 1
        accum += (*(npy_long *)data0);
        data0 += stride0;
#  elif 2 == 2
        /* This is the branch that is compiled: multiply, accumulate, step. */
        accum += (*(npy_long *)data0) *
                 (*(npy_long *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 2 == 3
        accum += (*(npy_long *)data0) *
                 (*(npy_long *)data1) *
                 (*(npy_long *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        /* Variable operand count: works on dataptr[] in place. */
        npy_long temp = (*(npy_long *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_long *)dataptr[i]);
        }
        accum += temp;
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 2 == 1
        accum_re += ((npy_long *)data0)[0];
        accum_im += ((npy_long *)data0)[1];
        data0 += stride0;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_long re, im, tmp;
        int i;
        re = ((npy_long *)dataptr[0])[0];
        im = ((npy_long *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            /* tmp holds the new real part until im has been updated. */
            tmp = re * ((npy_long *)dataptr[i])[0] -
                  im * ((npy_long *)dataptr[i])[1];
            im = re * ((npy_long *)dataptr[i])[1] +
                 im * ((npy_long *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Write-back: "#if 0" is false, so the real-valued #else branch is used. */
#if 0
#  if 2 <= 3
    ((npy_long *)dataptr[2])[0] += accum_re;
    ((npy_long *)dataptr[2])[1] += accum_im;
#  else
    ((npy_long *)dataptr[nop])[0] += accum_re;
    ((npy_long *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 2 <= 3
    /* Compiled branch: add the total to the existing output value. */
    *((npy_long *)dataptr[2]) = (accum +
                                    (*((npy_long *)dataptr[2])));
#  else
    *((npy_long *)dataptr[nop]) = (accum +
                                    (*((npy_long *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * Three-operand sum of products for npy_long with arbitrary strides on the
 * inputs and on the output.
 *
 * The preprocessor conditions in this function are all constant
 * expressions.  With the values that appear here, the compiled loop runs
 * count times and performs
 *
 *     *data_out = *data0 * *data1 * *data2 + *data_out
 *
 * and then advances each of the four char pointers by its entry in strides
 * (so the strides are applied in bytes).  Operands 0..2 come from
 * dataptr[0..2] and the output from dataptr[3].  nop is not read by the
 * compiled code.
 */
static void
long_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* True here: cursor/stride for operand 0. */
#if (3 == 1) || (3 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
    /* True here: cursor/stride for operand 1. */
#if (3 == 2 || 3 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
    /* True here: cursor/stride for operand 2. */
#if (3 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
    /* True here: cursor/stride for the output, stored after the inputs. */
#if (3 == 1) || (3 <= 3 && !0)
    char *data_out = dataptr[3];
    npy_intp stride_out = strides[3];
#endif

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 3 == 1
        *(npy_long *)data_out = ((*(npy_long *)data0) +
                                         (*(npy_long *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 3 == 2
        *(npy_long *)data_out = ((*(npy_long *)data0) *
                                         (*(npy_long *)data1) +
                                         (*(npy_long *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 3 == 3
        /* This is the branch that is compiled. */
        *(npy_long *)data_out = ((*(npy_long *)data0) *
                                         (*(npy_long *)data1) *
                                         (*(npy_long *)data2) +
                                         (*(npy_long *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        /* Variable operand count: works on dataptr[] in place. */
        npy_long temp = (*(npy_long *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_long *)dataptr[i]);
        }
        /* i is left at the loop's exit value when used as the index here. */
        *(npy_long *)dataptr[nop] = (temp +
                                           (*(npy_long *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 3 == 1
        ((npy_long *)data_out)[0] = ((npy_long *)data0)[0] +
                                         ((npy_long *)data_out)[0];
        ((npy_long *)data_out)[1] = ((npy_long *)data0)[1] +
                                         ((npy_long *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 3 <= 3
#define _SUMPROD_NOP 3
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_long re, im, tmp;
        int i;
        re = ((npy_long *)dataptr[0])[0];
        im = ((npy_long *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            /* tmp holds the new real part until im has been updated. */
            tmp = re * ((npy_long *)dataptr[i])[0] -
                  im * ((npy_long *)dataptr[i])[1];
            im = re * ((npy_long *)dataptr[i])[1] +
                 im * ((npy_long *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 3 == 1

/*
 * LONG_ADD1_STEP(k): add input element k into output element k.  The
 * second definition treats both arrays as interleaved (re, im) pairs and
 * updates both halves of pair k.
 */
#if !0
#  define LONG_ADD1_STEP(k) \
        (dst[(k)] = ((src[(k)]) + (dst[(k)])))
#else /* complex */
#  define LONG_ADD1_STEP(k) \
        do { \
            ((npy_long *)dst + 2*(k))[0] = \
                                ((npy_long *)src + 2*(k))[0] + \
                                ((npy_long *)dst + 2*(k))[0]; \
            ((npy_long *)dst + 2*(k))[1] = \
                                ((npy_long *)src + 2*(k))[1] + \
                                ((npy_long *)dst + 2*(k))[1]; \
        } while (0)
#endif

/*
 * One-operand "sum of products": accumulates the elements at dataptr[0]
 * into the elements at dataptr[1], position by position.
 * nop and strides are not used.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *src = (npy_long *)dataptr[0];
    npy_long *dst = (npy_long *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Counts below eight are finished here through a fall-through
         * chain; testing this before the unrolled loop keeps small counts
         * cheap.  Larger counts match no case and continue below.
         */
        switch (count) {
            case 7:
                LONG_ADD1_STEP(6);
                /* fallthrough */
            case 6:
                LONG_ADD1_STEP(5);
                /* fallthrough */
            case 5:
                LONG_ADD1_STEP(4);
                /* fallthrough */
            case 4:
                LONG_ADD1_STEP(3);
                /* fallthrough */
            case 3:
                LONG_ADD1_STEP(2);
                /* fallthrough */
            case 2:
                LONG_ADD1_STEP(1);
                /* fallthrough */
            case 1:
                LONG_ADD1_STEP(0);
                /* fallthrough */
            case 0:
                return;
        }

        /* Eight explicit steps per iteration. */
        while (count >= 8) {
            count -= 8;

            LONG_ADD1_STEP(0);
            LONG_ADD1_STEP(1);
            LONG_ADD1_STEP(2);
            LONG_ADD1_STEP(3);
            LONG_ADD1_STEP(4);
            LONG_ADD1_STEP(5);
            LONG_ADD1_STEP(6);
            LONG_ADD1_STEP(7);

            src += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish off the remainder. */
    }
}

#undef LONG_ADD1_STEP

#elif 3 == 2 && !0

/*
 * Multiply-add helper: for every k in [0, count) performs
 *
 *     data_out[k] = scalar * data[k] + data_out[k]
 *
 * data and data_out are indexed as densely packed npy_long arrays.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
{
    /*
     * The condition below is the literal 0, so this vector (npyv_*) branch
     * is never compiled for npy_long; the scalar code after #else is what
     * actually runs.
     */
#if 0 // NPYV check for npy_long
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_long;
    const npyv_long v_scalar = npyv_setall_long(scalar);
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_long b0 = npyv_loada_long(data + vstep * 0);
            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
            
#line 312
            npyv_long b1 = npyv_loada_long(data + vstep * 1);
            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
            
#line 312
            npyv_long b2 = npyv_loada_long(data + vstep * 2);
            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
            
#line 312
            npyv_long b3 = npyv_loada_long(data + vstep * 3);
            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
            
            #line 318
            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
            
#line 318
            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
            
#line 318
            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
            
#line 318
            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_long(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_long(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_long(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_long(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        /* Same loop as above, using the non-aligned load/store variants. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_long b0 = npyv_load_long(data + vstep * 0);
            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
            
#line 312
            npyv_long b1 = npyv_load_long(data + vstep * 1);
            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
            
#line 312
            npyv_long b2 = npyv_load_long(data + vstep * 2);
            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
            
#line 312
            npyv_long b3 = npyv_load_long(data + vstep * 3);
            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
            
            #line 318
            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
            
#line 318
            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
            
#line 318
            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
            
#line 318
            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
            
            #line 323
            npyv_store_long(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_long(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_long(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_long(data_out + vstep * 3, abc3);
            
        }
    }
    
    /* Remaining elements, one vector step at a time via the "till" variants. */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_long a = npyv_load_tillz_long(data, count);
        npyv_long b = npyv_load_tillz_long(data_out, count);
        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, manually unrolled by four.  Within one iteration all
     * eight loads are issued first, then the four multiply-adds, then the
     * four stores.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_long b0 = (data[0]);
        const npy_long c0 = (data_out[0]);
        
#line 340
        const npy_long b1 = (data[1]);
        const npy_long c1 = (data_out[1]);
        
#line 340
        const npy_long b2 = (data[2]);
        const npy_long c2 = (data_out[2]);
        
#line 340
        const npy_long b3 = (data[3]);
        const npy_long c3 = (data_out[3]);
        
        #line 346
        const npy_long abc0 = scalar * b0 + c0;
        
#line 346
        const npy_long abc1 = scalar * b1 + c1;
        
#line 346
        const npy_long abc2 = scalar * b2 + c2;
        
#line 346
        const npy_long abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Element-by-element loop: handles the leftover (< 4) elements, or every
     * element when NPY_DISABLE_OPTIMIZATION removes the unrolled loop above.
     */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_long b = (*data);
        const npy_long c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_long
}

/*
 * Two-operand kernel for contiguous npy_long data:
 *
 *     out[i] = in0[i] * in1[i] + out[i]      for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the output.
 * Neither nop nor strides is consulted.
 *
 * The NPYV branch of the generating template is preprocessed out
 * (`#if 0`) for npy_long, so only the scalar code is spelled out here.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per pass.  All four results are computed before the
     * first store, exactly as in the hand-unrolled form.
     */
    while (count >= 4) {
        npy_long fused[4];
        int lane;

        for (lane = 0; lane < 4; ++lane) {
            fused[lane] = lhs[lane] * rhs[lane] + out[lane];
        }
        for (lane = 0; lane < 4; ++lane) {
            out[lane] = fused[lane];
        }

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Whatever is left, one element at a time. */
    while (count > 0) {
        *out = (*lhs) * (*rhs) + (*out);
        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel in which operand 0 contributes one value (read once
 * from dataptr[0]) while operand 1 and the output are walked as
 * contiguous npy_long arrays.  The element-wise multiply-accumulate is
 * delegated to long_sum_of_products_muladd.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[0];
    npy_long *vec = (npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Mirror image of the stride0/contig kernel: operand 1 contributes one
 * value (read once from dataptr[1]) while operand 0 and the output are
 * walked as contiguous npy_long arrays.  The element-wise
 * multiply-accumulate is delegated to long_sum_of_products_muladd.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[1];
    npy_long *vec = (npy_long *)dataptr[0];
    npy_long *out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand kernel that reduces into a single output element:
 *
 *     *out = *out + sum(in0[i] * in1[i])     for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are contiguous npy_long inputs; dataptr[2]
 * points at the one output value.  Neither nop nor strides is consulted.
 *
 * The NPYV branch of the generating template is preprocessed out
 * (`#if 0`) for npy_long, so only the scalar code is spelled out here.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *lhs = (npy_long *)dataptr[0];
    npy_long *rhs = (npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];
    npy_long dot = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per pass, summed together before joining the total. */
    while (count >= 4) {
        npy_long partial = lhs[0] * rhs[0];

        partial += lhs[1] * rhs[1];
        partial += lhs[2] * rhs[2];
        partial += lhs[3] * rhs[3];
        dot += partial;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Whatever is left, one product at a time. */
    while (count > 0) {
        dot += (*lhs) * (*rhs);
        --count;
        ++lhs;
        ++rhs;
    }

    *out = *out + dot;
}

/*
 * Two-operand reduction in which operand 0 contributes one value (read
 * once from dataptr[0]).  The contiguous operand 1 is first reduced with
 * long_sum_of_arr (defined elsewhere in this file); that result times the
 * operand-0 value is then added to the single output element at
 * dataptr[2].
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[0];
    npy_long *out = (npy_long *)dataptr[2];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[1], count);

    *out = *out + factor * total;
}

/*
 * Two-operand reduction in which operand 1 contributes one value (read
 * once from dataptr[1]).  The contiguous operand 0 is first reduced with
 * long_sum_of_arr (defined elsewhere in this file); that result times the
 * operand-1 value is then added to the single output element at
 * dataptr[2].
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[1];
    npy_long *out = (npy_long *)dataptr[2];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);

    *out = *out + factor * total;
}

#elif 3 == 3 && !0

/*
 * Three-operand kernel for contiguous npy_long data:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]     for i in [0, count)
 *
 * dataptr[0..2] are the inputs, dataptr[3] is the output.  Neither nop
 * nor strides is consulted.
 *
 * Differences from the previous hand-unrolled form:
 *  - the tail is a bounded loop, so a non-positive count after the main
 *    loop does nothing (the old chain of `count-- == 0` equality tests
 *    never fired for a negative count, and its eighth stanza could not be
 *    reached for any non-negative one);
 *  - the debug trace that the other kernels emit is emitted here too.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long *data2 = (npy_long *)dataptr[2];
    npy_long *data_out = (npy_long *)dataptr[3];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    /* Main part: eight elements per pass, in ascending index order. */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i];
        }
        count -= 8;
        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Tail: at most seven elements remain at this point. */
    for (i = 0; i < count; ++i) {
        data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i];
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic variant of the contiguous kernel (the `#else` arm of the
 * operand-count dispatch): for each of `count` steps, multiplies the
 * current npy_long element of operands 0 .. nop-1 together, adds the
 * current output element, stores the result through dataptr[nop], and
 * then advances all nop + 1 pointers in dataptr by sizeof(npy_long).
 *
 * Note that dataptr itself is modified.  Only the real-valued branch of
 * the template (`#if !0`) applies to npy_long, so the complex branch is
 * not reproduced.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_long product = *(npy_long *)dataptr[0];
        int op = 1;

        while (op < nop) {
            product *= *(npy_long *)dataptr[op];
            ++op;
        }

        /*
         * The previous output value is read through the index the loop
         * stopped at; that index equals nop whenever nop >= 1.
         */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[op];

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_long);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * One-operand reduction into a single output element: the contiguous
 * npy_long input at dataptr[0] is reduced with long_sum_of_arr (defined
 * elsewhere in this file) and the result is added to the value at
 * dataptr[1].  Neither nop nor strides is consulted.
 *
 * Only the real-valued branch of the template (`#if !0`) applies to
 * npy_long, so the complex branch is not reproduced.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    npy_long *out = (npy_long *)dataptr[1];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);

    *out = total + *out;
}

#endif /* 3 == 1 */

/*
 * Three-operand reduction into a single output element, for arbitrarily
 * strided inputs:
 *
 *     *out = sum(in0[i] * in1[i] * in2[i]) + *out     for i in [0, count)
 *
 * dataptr[0..2] are the inputs and strides[0..2] their byte steps;
 * dataptr[3] points at the one output value.  nop is not consulted.
 *
 * All preprocessor conditions of the template are literal constants in
 * this expansion (three operands, real-valued type), so only the branch
 * that is actually compiled is kept.
 */
static void
long_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *cursor0 = dataptr[0];
    char *cursor1 = dataptr[1];
    char *cursor2 = dataptr[2];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step2 = strides[2];
    npy_long *out = (npy_long *)dataptr[3];
    npy_long total = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_long *)cursor0) *
                 (*(npy_long *)cursor1) *
                 (*(npy_long *)cursor2);
        cursor0 += step0;
        cursor1 += step1;
        cursor2 += step2;
    }

    *out = total + *out;
}


#line 131
/*
 * Fully general kernel for any operand count and any strides: for each
 * of `count` steps, multiplies the current npy_long element of operands
 * 0 .. nop-1 together, adds the current output element, stores the
 * result through dataptr[nop], and then advances each of the nop + 1
 * pointers in dataptr by its entry in strides.
 *
 * Note that dataptr itself is modified.  All preprocessor conditions of
 * the template are literal constants in this expansion (operand-count
 * tag 1000, real-valued type), so only the branch that is actually
 * compiled is kept.
 */
static void
long_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_long product = *(npy_long *)dataptr[0];
        int op = 1;

        while (op < nop) {
            product *= *(npy_long *)dataptr[op];
            ++op;
        }

        /*
         * The previous output value is read through the index the loop
         * stopped at; that index equals nop whenever nop >= 1.
         */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[op];

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += strides[op];
        }
    }
}

#if 1000 == 1

/*
 * One-operand kernel for contiguous npy_long data:
 *
 *     out[i] = in0[i] + out[i]      for i in [0, count)
 *
 * dataptr[0] is the input, dataptr[1] the output.  Neither nop nor
 * strides is consulted.
 *
 * Differences from the previous switch/goto form:
 *  - a negative count now returns immediately; before, no switch case
 *    matched, the unrolled loop was skipped and the backward goto
 *    repeated forever;
 *  - the implicit case fall-through is replaced by an explicit loop.
 * For every count >= 0 the same elements are updated in the same order:
 * blocks of eight in ascending order, then the remaining 0..7 elements
 * from the highest index down.
 *
 * Only the real-valued branch of the template (`#if !0`) applies to
 * npy_long, so the complex branch is not reproduced.
 */
static void
long_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data_out = (npy_long *)dataptr[1];
    npy_intp i;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Main part: eight elements per pass. */
    while (count >= 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = data0[i] + data_out[i];
        }
        count -= 8;
        data0 += 8;
        data_out += 8;
    }

    /* Remainder: highest index first, as the fall-through switch did. */
    for (i = count - 1; i >= 0; --i) {
        data_out[i] = data0[i] + data_out[i];
    }
}

#elif 1000 == 2 && !0

/*
 * Scaled accumulate over contiguous data:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for 0 <= i < count
 *
 * data      input elements, only read
 * data_out  output elements, read and then overwritten in place
 * scalar    factor applied to every element of data
 * count     number of elements; no element is touched when count <= 0
 *
 * The pointer parameters are advanced as the loops run; they are by-value
 * copies, so the caller's pointers are unaffected.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
{
#if 0 // NPYV check for npy_long
    /*
     * Vector path.  The `#if 0` above removes this whole section from the
     * build, so the scalar code after the matching `#else` is what runs.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_long;
    const npyv_long v_scalar = npyv_setall_long(scalar);
    #line 306
    if(is_aligned) {
        /* Both buffers aligned: four vectors per iteration, loada/storea variants. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_long b0 = npyv_loada_long(data + vstep * 0);
            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
            
#line 312
            npyv_long b1 = npyv_loada_long(data + vstep * 1);
            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
            
#line 312
            npyv_long b2 = npyv_loada_long(data + vstep * 2);
            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
            
#line 312
            npyv_long b3 = npyv_loada_long(data + vstep * 3);
            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
            
            #line 318
            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
            
#line 318
            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
            
#line 318
            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
            
#line 318
            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_long(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_long(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_long(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_long(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        /* Same loop as above, using the plain load/store variants. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_long b0 = npyv_load_long(data + vstep * 0);
            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
            
#line 312
            npyv_long b1 = npyv_load_long(data + vstep * 1);
            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
            
#line 312
            npyv_long b2 = npyv_load_long(data + vstep * 2);
            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
            
#line 312
            npyv_long b3 = npyv_load_long(data + vstep * 3);
            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
            
            #line 318
            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
            
#line 318
            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
            
#line 318
            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
            
#line 318
            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
            
            #line 323
            npyv_store_long(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_long(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_long(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_long(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder: one vector per iteration.  The tillz/till variants receive
     * the remaining count (presumably to limit the lanes accessed - confirm
     * against the NPYV headers).
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_long a = npyv_load_tillz_long(data, count);
        npyv_long b = npyv_load_tillz_long(data_out, count);
        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
    }
    npyv_cleanup();
#else
    /* Scalar path. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration: all eight loads first, then the four
     * multiply-adds, then the four stores.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_long b0 = (data[0]);
        const npy_long c0 = (data_out[0]);
        
#line 340
        const npy_long b1 = (data[1]);
        const npy_long c1 = (data_out[1]);
        
#line 340
        const npy_long b2 = (data[2]);
        const npy_long c2 = (data_out[2]);
        
#line 340
        const npy_long b3 = (data[3]);
        const npy_long c3 = (data_out[3]);
        
        #line 346
        const npy_long abc0 = scalar * b0 + c0;
        
#line 346
        const npy_long abc1 = scalar * b1 + c1;
        
#line 346
        const npy_long abc2 = scalar * b2 + c2;
        
#line 346
        const npy_long abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Whatever is left (at most three elements after the loop above, or the
     * whole input when NPY_DISABLE_OPTIMIZATION is defined), one at a time.
     */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_long b = (*data);
        const npy_long c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_long
}

/*
 * Two-operand kernel for contiguous inputs and a contiguous output:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for 0 <= i < count
 *
 * dataptr[0], dataptr[1]  the two inputs, viewed as npy_long arrays
 * dataptr[2]              the output, read and overwritten in place
 * count                   number of elements; nothing is touched if <= 0
 *
 * nop and strides are not referenced.  Only local copies of the three
 * pointers are advanced; the entries of dataptr are left unchanged.
 */
static void
long_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long *data_out = (npy_long *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_long
#if 0
    /*
     * Vector path.  The `#if 0` above removes this whole section from the
     * build, so the scalar code after the matching `#else` is what runs.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_long;

    #line 384
    if(is_aligned) {
        /* All three buffers aligned: four vectors per iteration, loada/storea variants. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
            
#line 390
            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
            
#line 390
            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
            
#line 390
            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
            
            #line 397
            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
            
#line 397
            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
            
#line 397
            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
            
#line 397
            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
            
            #line 402
            npyv_storea_long(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_long(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_long(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_long(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        /* Same loop as above, using the plain load/store variants. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
            
#line 390
            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
            
#line 390
            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
            
#line 390
            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
            
            #line 397
            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
            
#line 397
            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
            
#line 397
            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
            
#line 397
            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
            
            #line 402
            npyv_store_long(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_long(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_long(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_long(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Remainder: one vector per iteration.  The tillz/till variants receive
     * the remaining count (presumably to limit the lanes accessed - confirm
     * against the NPYV headers).
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_long a = npyv_load_tillz_long(data0, count);
        npyv_long b = npyv_load_tillz_long(data1, count);
        npyv_long c = npyv_load_tillz_long(data_out, count);
        npyv_store_till_long(data_out, count, npyv_muladd_long(a, b, c));
    }
    npyv_cleanup();
#else
    /* Scalar path. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration: all twelve loads first, then the four
     * multiply-adds, then the four stores.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_long a0 = (data0[0]);
        const npy_long b0 = (data1[0]);
        const npy_long c0 = (data_out[0]);
        
#line 420
        const npy_long a1 = (data0[1]);
        const npy_long b1 = (data1[1]);
        const npy_long c1 = (data_out[1]);
        
#line 420
        const npy_long a2 = (data0[2]);
        const npy_long b2 = (data1[2]);
        const npy_long c2 = (data_out[2]);
        
#line 420
        const npy_long a3 = (data0[3]);
        const npy_long b3 = (data1[3]);
        const npy_long c3 = (data_out[3]);
        
        #line 427
        const npy_long abc0 = a0 * b0 + c0;
        
#line 427
        const npy_long abc1 = a1 * b1 + c1;
        
#line 427
        const npy_long abc2 = a2 * b2 + c2;
        
#line 427
        const npy_long abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Whatever is left (at most three elements after the loop above, or the
     * whole input when NPY_DISABLE_OPTIMIZATION is defined), one at a time.
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_long a = (*data0);
        const npy_long b = (*data1);
        const npy_long c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_long

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 is a single value and both operand 1
 * and the output are contiguous:
 *
 *     out[i] = op0 * op1[i] + out[i]    for 0 <= i < count
 *
 * op0 is read once through dataptr[0]; the element loop itself lives in
 * long_sum_of_products_muladd().  nop and strides are not referenced.
 */
static void
long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[0];
    npy_long *const vec = (npy_long *)dataptr[1];
    npy_long *const out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand kernel where operand 1 is a single value and both operand 0
 * and the output are contiguous:
 *
 *     out[i] = op0[i] * op1 + out[i]    for 0 <= i < count
 *
 * op1 is read once through dataptr[1]; the element loop itself lives in
 * long_sum_of_products_muladd().  nop and strides are not referenced.
 */
static void
long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[1];
    npy_long *const vec = (npy_long *)dataptr[0];
    npy_long *const out = (npy_long *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    long_sum_of_products_muladd(vec, out, factor, count);
}

/*
 * Two-operand kernel for contiguous inputs reduced into a single output
 * element:
 *
 *     *out = *out + sum(in0[i] * in1[i])    for 0 <= i < count
 *
 * dataptr[0], dataptr[1]  the two inputs, viewed as npy_long arrays
 * dataptr[2]              the output element; read and written exactly once,
 *                         after the whole sum has been formed in a local
 * count                   number of elements; the output is rewritten with
 *                         its own value (accum stays 0) when count <= 0
 *
 * nop and strides are not referenced.  The entries of dataptr are left
 * unchanged.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *data0 = (npy_long *)dataptr[0];
    npy_long *data1 = (npy_long *)dataptr[1];
    npy_long accum = 0;

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_long
    /*
     * Vector path.  The `#if 0` above removes this whole section from the
     * build, so the scalar code after the matching `#else` is what runs.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_long;
    npyv_long vaccum = npyv_zero_long();

    #line 495
    if(is_aligned) {
        /* Both inputs aligned: four vectors per iteration, loada variant. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
            
#line 501
            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
            
#line 501
            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
            
#line 501
            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_long ab3 = npyv_muladd_long(a3, b3, vaccum);
            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
                    vaccum = npyv_muladd_long(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        /* Same loop as above, using the plain load variant. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
            
#line 501
            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
            
#line 501
            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
            
#line 501
            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
            
            /* Chain the four multiply-adds through the running accumulator. */
            npyv_long ab3 = npyv_muladd_long(a3, b3, vaccum);
            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
                    vaccum = npyv_muladd_long(a0, b0, ab1);
        }
    }
    
    /*
     * Remainder: one vector per iteration.  The tillz variant receives the
     * remaining count (presumably zero-filling unused lanes - confirm
     * against the NPYV headers).
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_long a = npyv_load_tillz_long(data0, count);
        npyv_long b = npyv_load_tillz_long(data1, count);
        vaccum = npyv_muladd_long(a, b, vaccum);
    }
    /* Collapse the vector accumulator into the scalar one. */
    accum = npyv_sum_long(vaccum);
    npyv_cleanup();
#else
    /* Scalar path. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per iteration, summed together before touching accum. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_long ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_long ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_long ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_long ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Whatever is left (at most three elements after the loop above, or the
     * whole input when NPY_DISABLE_OPTIMIZATION is defined), one at a time.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_long a = (*data0);
        const npy_long b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_long
    /* Single read-modify-write of the output element. */
    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum);
}

/*
 * Two-operand kernel where operand 0 is a single value, operand 1 is
 * contiguous and the output is a single element:
 *
 *     *out = *out + op0 * sum(op1[i])    for 0 <= i < count
 *
 * The scalar factor is applied once to the total returned by
 * long_sum_of_arr() rather than to each element.  nop and strides are not
 * referenced.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[0];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[1], count);
    npy_long *const out = (npy_long *)dataptr[2];

    *out += factor * total;
}

/*
 * Two-operand kernel where operand 0 is contiguous, operand 1 is a single
 * value and the output is a single element:
 *
 *     *out = *out + op1 * sum(op0[i])    for 0 <= i < count
 *
 * The scalar factor is applied once to the total returned by
 * long_sum_of_arr() rather than to each element.  nop and strides are not
 * referenced.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_long factor = *(npy_long *)dataptr[1];
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
    npy_long *const out = (npy_long *)dataptr[2];

    *out += factor * total;
}

#elif 1000 == 3 && !0

/*
 * Three-operand kernel for contiguous inputs and a contiguous output:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for 0 <= i < count
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output.  Elements are
 * visited in increasing index order.  Only local copies of the pointers are
 * advanced; the entries of dataptr are left unchanged.  nop and strides are
 * not referenced.
 */
static void
long_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_long *in0 = (npy_long *)dataptr[0];
    npy_long *in1 = (npy_long *)dataptr[1];
    npy_long *in2 = (npy_long *)dataptr[2];
    npy_long *out = (npy_long *)dataptr[3];
    int k;

    /* Whole groups of eight; the inner trip count is a compile-time constant. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Leftover elements: test-and-decrement before each one and leave as
     * soon as the remaining count has reached zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Generic kernel for any number of contiguous operands plus a contiguous
 * output.  For each of the `count` elements:
 *
 *     out = in_0 * in_1 * ... * in_(nop-1) + out
 *
 * dataptr[0..nop-1] are the inputs and dataptr[nop] is the output.  Unlike
 * the fixed-operand kernels, this one advances the entries of dataptr
 * themselves, each by sizeof(npy_long) per element.  strides is not
 * referenced.
 */
static void
long_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
        npy_long product = *(npy_long *)dataptr[0];
        int op = 1;

        /* Fold in the remaining inputs. */
        while (op < nop) {
            product *= *(npy_long *)dataptr[op];
            ++op;
        }
        /* op is where the loop stopped; it equals nop whenever nop >= 1. */
        *(npy_long *)dataptr[nop] = product + *(npy_long *)dataptr[op];

        /* Step every pointer, the output included, to the next element. */
        for (op = nop; op >= 0; --op) {
            dataptr[op] += sizeof(npy_long);
        }
#else /* complex */
#  if 1000 <= 3
#    define _SUMPROD_NOP 1000
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_long re, im, tmp;
        int i;
        re = ((npy_long *)dataptr[0])[0];
        im = ((npy_long *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_long *)dataptr[i])[0] -
                  im * ((npy_long *)dataptr[i])[1];
            im = re * ((npy_long *)dataptr[i])[1] +
                 im * ((npy_long *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_long);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand kernel for a contiguous input reduced into a single output
 * element:
 *
 *     *out = sum(in[i]) + *out    for 0 <= i < count
 *
 * dataptr[0] is the input and dataptr[1] the output element.  The integer
 * branch hands the summation to long_sum_of_arr(); the branch after `#else`
 * is the complex-number form of the template and is compiled out here.
 * nop and strides are not referenced.
 */
static NPY_GCC_OPT_3 void
long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
    npy_long *const out = (npy_long *)dataptr[1];

    *out = total + *out;
#else
    npy_long accum_re = 0, accum_im = 0;
    npy_long *data0 = (npy_long *)dataptr[0];
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count > 4; count -= 4, data0 += 4*2) {
        const npy_long re01 = data0[0] + data0[2];
        const npy_long re23 = data0[4] + data0[6];
        const npy_long im13 = data0[1] + data0[3];
        const npy_long im57 = data0[5] + data0[7];
        accum_re += re01 + re23;
        accum_im += im13 + im57;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    for (; count > 0; --count, data0 += 2) {
        accum_re += data0[0];
        accum_im += data0[1];
    }
    ((npy_long *)dataptr[1])[0] += accum_re;
    ((npy_long *)dataptr[1])[1] += accum_im;
#endif // !0
}

#endif /* 1000 == 1 */

/*
 * Generic strided kernel reducing into a single output element.  For this
 * instantiation (more than three operands, non-complex) it computes
 *
 *     *out = sum over the count elements of (in_0 * ... * in_(nop-1)) + *out
 *
 * dataptr[0..nop-1] are the inputs, stepped by strides[0..nop-1] after each
 * element; the stepping is applied to the dataptr entries themselves.
 * dataptr[nop] is the output element; it is not stepped, and is read and
 * written once after the loop.
 *
 * The other preprocessor branches are the one/two/three-operand and
 * complex-number forms of the same template and are compiled out here.
 */
static void
long_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if 0
    npy_long accum_re = 0, accum_im = 0;
#else
    npy_long accum = 0;
#endif

#if (1000 == 1) || (1000 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1000 == 2 || 1000 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1000 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1000 == 1
        accum += (*(npy_long *)data0);
        data0 += stride0;
#  elif 1000 == 2
        accum += (*(npy_long *)data0) *
                 (*(npy_long *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 1000 == 3
        accum += (*(npy_long *)data0) *
                 (*(npy_long *)data1) *
                 (*(npy_long *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        npy_long product = *(npy_long *)dataptr[0];
        int op = 1;

        /* Multiply the current element of every input together. */
        while (op < nop) {
            product *= *(npy_long *)dataptr[op];
            ++op;
        }
        accum += product;

        /* Step the inputs only; the output slot dataptr[nop] stays put. */
        op = 0;
        while (op < nop) {
            dataptr[op] += strides[op];
            ++op;
        }
#  endif
#else /* complex */
#  if 1000 == 1
        accum_re += ((npy_long *)data0)[0];
        accum_im += ((npy_long *)data0)[1];
        data0 += stride0;
#  else
#    if 1000 <= 3
#define _SUMPROD_NOP 1000
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_long re, im, tmp;
        int i;
        re = ((npy_long *)dataptr[0])[0];
        im = ((npy_long *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_long *)dataptr[i])[0] -
                  im * ((npy_long *)dataptr[i])[1];
            im = re * ((npy_long *)dataptr[i])[1] +
                 im * ((npy_long *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

#if 0
#  if 1000 <= 3
    ((npy_long *)dataptr[1000])[0] += accum_re;
    ((npy_long *)dataptr[1000])[1] += accum_im;
#  else
    ((npy_long *)dataptr[nop])[0] += accum_re;
    ((npy_long *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 1000 <= 3
    *((npy_long *)dataptr[1000]) = (accum +
                                    (*((npy_long *)dataptr[1000])));
#  else
    /* Fold the local total into the output element in one step. */
    *(npy_long *)dataptr[nop] += accum;
#  endif
#endif

}




#line 74

#if !0
/*
 * Return the sum of `count` contiguous npy_longlong values starting at
 * `data`; the result is 0 when count <= 0.
 *
 * The section under `#if 0` is the vector form of the template and is
 * compiled out, so the scalar code after its `#else` is what runs.
 */
static NPY_GCC_OPT_3 npy_longlong longlong_sum_of_arr(npy_longlong *data, npy_intp count)
{
    npy_longlong accum = 0;
#if 0 // NPYV check for npy_longlong
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data);
    const int vstep = npyv_nlanes_s64;
    npyv_s64 vaccum = npyv_zero_s64();
    const npy_intp vstepx4 = vstep * 4;

    #line 91
    if(is_aligned) {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s64 a0 = npyv_loada_s64(data + vstep * 0);
            
#line 96
            npyv_s64 a1 = npyv_loada_s64(data + vstep * 1);
            
#line 96
            npyv_s64 a2 = npyv_loada_s64(data + vstep * 2);
            
#line 96
            npyv_s64 a3 = npyv_loada_s64(data + vstep * 3);
            
            npyv_s64 a01   = npyv_add_s64(a0, a1);
            npyv_s64 a23   = npyv_add_s64(a2, a3);
            npyv_s64 a0123 = npyv_add_s64(a01, a23);
                      vaccum = npyv_add_s64(a0123, vaccum);
        }
    }
    
#line 91
    else {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_s64 a0 = npyv_load_s64(data + vstep * 0);
            
#line 96
            npyv_s64 a1 = npyv_load_s64(data + vstep * 1);
            
#line 96
            npyv_s64 a2 = npyv_load_s64(data + vstep * 2);
            
#line 96
            npyv_s64 a3 = npyv_load_s64(data + vstep * 3);
            
            npyv_s64 a01   = npyv_add_s64(a0, a1);
            npyv_s64 a23   = npyv_add_s64(a2, a3);
            npyv_s64 a0123 = npyv_add_s64(a01, a23);
                      vaccum = npyv_add_s64(a0123, vaccum);
        }
    }
    
    for (; count > 0; count -= vstep, data += vstep) {
        npyv_s64 a = npyv_load_tillz_s64(data, count);
        vaccum = npyv_add_s64(a, vaccum);
    }
    accum = npyv_sum_s64(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Groups of four, summed pairwise.  The bound is strictly greater than
     * four, so between one and four elements are always left for the loop
     * below when count is positive.
     */
    while (count > 4) {
        const npy_longlong front = data[0] + data[1];
        const npy_longlong back = data[2] + data[3];

        accum += front + back;
        data += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements one at a time. */
    while (count > 0) {
        accum += *data;
        ++data;
        --count;
    }
#endif // NPYV check for npy_longlong
    return accum;
}
#endif

#line 131
/*
 * One-operand strided kernel.  For this instantiation (one operand,
 * non-complex) it computes, for each of the `count` elements,
 *
 *     out = in + out
 *
 * dataptr[0] / strides[0] describe the input and dataptr[1] / strides[1]
 * the output.  Strides are applied to char pointers, i.e. they are byte
 * offsets.  Only local copies of the pointers are stepped in this branch;
 * the entries of dataptr are left unchanged.
 *
 * The other preprocessor branches are the two/three/any-operand and
 * complex-number forms of the same template and are compiled out here.
 */
static void
longlong_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (1 == 1) || (1 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
    char *data_out = dataptr[1];
    npy_intp stride_out = strides[1];
#endif
#if (1 == 2 || 1 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
#if !0
#  if 1 == 1
        /* Accumulate the current input element into the current output. */
        *(npy_longlong *)data_out += *(npy_longlong *)data0;
        data_out += stride_out;
        data0 += stride0;
#  elif 1 == 2
        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
                                         (*(npy_longlong *)data1) +
                                         (*(npy_longlong *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 1 == 3
        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
                                         (*(npy_longlong *)data1) *
                                         (*(npy_longlong *)data2) +
                                         (*(npy_longlong *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_longlong *)dataptr[i]);
        }
        *(npy_longlong *)dataptr[nop] = (temp +
                                           (*(npy_longlong *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1 == 1
        ((npy_longlong *)data_out)[0] = ((npy_longlong *)data0)[0] +
                                         ((npy_longlong *)data_out)[0];
        ((npy_longlong *)data_out)[1] = ((npy_longlong *)data0)[1] +
                                         ((npy_longlong *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_longlong re, im, tmp;
        int i;
        re = ((npy_longlong *)dataptr[0])[0];
        im = ((npy_longlong *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_longlong *)dataptr[i])[0] -
                  im * ((npy_longlong *)dataptr[i])[1];
            im = re * ((npy_longlong *)dataptr[i])[1] +
                 im * ((npy_longlong *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 1 == 1

/*
 * Contiguous single-operand kernel for npy_longlong.
 *
 * Adds each of the `count` input elements into the matching output
 * element: out[i] = in[i] + out[i].
 *
 *   dataptr[0]  input, accessed as a contiguous npy_longlong array
 *   dataptr[1]  output, accessed as a contiguous npy_longlong array
 *   count       number of elements to process
 *
 * `nop` and `strides` are not read.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * The short-count dispatch is tested ahead of the unrolled loop so
         * that small counts finish quickly.  Every case intentionally falls
         * through to the next lower one, so a remainder of k updates
         * elements k-1 down to 0 and then returns.
         */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fallthrough */
            case 6:
                dst[5] += src[5];
                /* fallthrough */
            case 5:
                dst[4] += src[4];
                /* fallthrough */
            case 4:
                dst[3] += src[3];
                /* fallthrough */
            case 3:
                dst[2] += src[2];
                /* fallthrough */
            case 2:
                dst[1] += src[1];
                /* fallthrough */
            case 1:
                dst[0] += src[0];
                /* fallthrough */
            case 0:
                return;
        }

        /* Eight elements per pass for as long as at least eight remain. */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }

        /* Go round again so the dispatch above finishes the remainder. */
    }
}

#elif 1 == 2 && !0

/*
 * Scaled accumulate over two contiguous arrays:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 * This is the plain scalar implementation; no NPYV (SIMD) path is enabled
 * for npy_longlong.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Groups of four.  All eight loads of a group are performed before any
     * of its four stores.
     */
    while (count >= 4) {
        const npy_longlong in0 = data[0];
        const npy_longlong in1 = data[1];
        const npy_longlong in2 = data[2];
        const npy_longlong in3 = data[3];
        const npy_longlong acc0 = data_out[0];
        const npy_longlong acc1 = data_out[1];
        const npy_longlong acc2 = data_out[2];
        const npy_longlong acc3 = data_out[3];

        data_out[0] = scalar * in0 + acc0;
        data_out[1] = scalar * in1 + acc1;
        data_out[2] = scalar * in2 + acc2;
        data_out[3] = scalar * in3 + acc3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif /* !NPY_DISABLE_OPTIMIZATION */

    /* Whatever is left (or everything, when the block above is disabled). */
    while (count > 0) {
        *data_out = scalar * (*data) + (*data_out);
        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Contiguous two-operand kernel for npy_longlong:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for i in [0, count)
 *
 *   dataptr[0], dataptr[1]  inputs, accessed as contiguous arrays
 *   dataptr[2]              output, accessed as a contiguous array
 *
 * `nop` and `strides` are not read.  Only the scalar path exists here; no
 * NPYV (SIMD) path is enabled for npy_longlong.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *lhs = (npy_longlong *)dataptr[0];
    npy_longlong *rhs = (npy_longlong *)dataptr[1];
    npy_longlong *dst = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Groups of four.  Every load of a group is performed before any of its
     * stores.
     */
    while (count >= 4) {
        const npy_longlong l0 = lhs[0];
        const npy_longlong l1 = lhs[1];
        const npy_longlong l2 = lhs[2];
        const npy_longlong l3 = lhs[3];
        const npy_longlong r0 = rhs[0];
        const npy_longlong r1 = rhs[1];
        const npy_longlong r2 = rhs[2];
        const npy_longlong r3 = rhs[3];
        const npy_longlong d0 = dst[0];
        const npy_longlong d1 = dst[1];
        const npy_longlong d2 = dst[2];
        const npy_longlong d3 = dst[3];

        dst[0] = l0 * r0 + d0;
        dst[1] = l1 * r1 + d1;
        dst[2] = l2 * r2 + d2;
        dst[3] = l3 * r3 + d3;

        count -= 4;
        lhs += 4;
        rhs += 4;
        dst += 4;
    }
#endif /* !NPY_DISABLE_OPTIMIZATION */

    /* Element-by-element for the remainder. */
    while (count > 0) {
        *dst = (*lhs) * (*rhs) + (*dst);
        --count;
        ++lhs;
        ++rhs;
        ++dst;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel in which operand 0 is read once as a scalar while
 * operand 1 and the output are walked as contiguous arrays.  The work is
 * delegated to longlong_sum_of_products_muladd with operand 0 as the
 * scale factor.
 *
 * `nop` and `strides` are not read.
 */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[0];
    npy_longlong *vec = (npy_longlong *)dataptr[1];
    npy_longlong *dst = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd(vec, dst, factor, count);
}

/*
 * Two-operand kernel in which operand 1 is read once as a scalar while
 * operand 0 and the output are walked as contiguous arrays.  The work is
 * delegated to longlong_sum_of_products_muladd with operand 1 as the
 * scale factor.
 *
 * `nop` and `strides` are not read.
 */
static void
longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[1];
    npy_longlong *vec = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd(vec, dst, factor, count);
}

/*
 * Two contiguous inputs reduced into a single output element:
 *
 *     *out = *out + sum over i in [0, count) of in0[i] * in1[i]
 *
 *   dataptr[0], dataptr[1]  inputs, accessed as contiguous arrays
 *   dataptr[2]              the one output element that is updated
 *
 * `nop` and `strides` are not read.  Only the scalar path exists here; no
 * NPYV (SIMD) path is enabled for npy_longlong.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *lhs = (npy_longlong *)dataptr[0];
    npy_longlong *rhs = (npy_longlong *)dataptr[1];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per pass, folded into the running total together. */
    while (count >= 4) {
        const npy_longlong p0 = lhs[0] * rhs[0];
        const npy_longlong p1 = lhs[1] * rhs[1];
        const npy_longlong p2 = lhs[2] * rhs[2];
        const npy_longlong p3 = lhs[3] * rhs[3];

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif /* !NPY_DISABLE_OPTIMIZATION */

    /* One product at a time for the remainder. */
    while (count > 0) {
        total += (*lhs) * (*rhs);
        --count;
        ++lhs;
        ++rhs;
    }

    /* The output is touched exactly once, after all inputs were read. */
    *(npy_longlong *)dataptr[2] = *(npy_longlong *)dataptr[2] + total;
}

/*
 * Operand 0 is read once as a scalar, operand 1 is reduced by
 * longlong_sum_of_arr over `count` elements, and the product of the two
 * is added to the single output element at dataptr[2].
 *
 * NOTE(review): longlong_sum_of_arr is defined elsewhere in this file;
 * presumably it returns the sum of the `count` elements -- confirm there.
 *
 * `nop` and `strides` are not read.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[0];
    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);
    npy_longlong *out = (npy_longlong *)dataptr[2];

    *out = *out + factor * total;
}

/*
 * Operand 1 is read once as a scalar, operand 0 is reduced by
 * longlong_sum_of_arr over `count` elements, and the product of the two
 * is added to the single output element at dataptr[2].
 *
 * NOTE(review): longlong_sum_of_arr is defined elsewhere in this file;
 * presumably it returns the sum of the `count` elements -- confirm there.
 *
 * `nop` and `strides` are not read.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[1];
    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
    npy_longlong *out = (npy_longlong *)dataptr[2];

    *out = *out + factor * total;
}

#elif 1 == 3 && !0

/*
 * Contiguous three-operand kernel for npy_longlong:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 *   dataptr[0..2]  inputs, accessed as contiguous arrays
 *   dataptr[3]     output, accessed as a contiguous array
 *
 * Elements are handled in ascending order, eight per pass first and then
 * one at a time for the remainder.  `nop` and `strides` are not read.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *a = (npy_longlong *)dataptr[0];
    npy_longlong *b = (npy_longlong *)dataptr[1];
    npy_longlong *c = (npy_longlong *)dataptr[2];
    npy_longlong *dst = (npy_longlong *)dataptr[3];
    int k;

    /* Blocks of eight while at least eight elements remain. */
    while (count >= 8) {
        count -= 8;

        for (k = 0; k < 8; ++k) {
            dst[k] = a[k] * b[k] * c[k] + dst[k];
        }

        a += 8;
        b += 8;
        c += 8;
        dst += 8;
    }

    /*
     * Remainder: the remaining count is tested (and decremented) before
     * each element, and the function returns as soon as it reads zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        dst[k] = a[k] * b[k] * c[k] + dst[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic contiguous kernel for an arbitrary operand count.
 *
 * For each of `count` steps, multiplies the current element of operands
 * 0 .. nop-1 together, adds the current output element, stores the result
 * through dataptr[nop], and then advances every pointer in dataptr[0..nop]
 * by sizeof(npy_longlong).
 *
 * Note that the entries of `dataptr` themselves are modified.  `strides`
 * is not read.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op;
        npy_longlong prod = *(npy_longlong *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            prod *= *(npy_longlong *)dataptr[op];
        }

        /*
         * After the loop `op` is the index the old output value is read
         * from (op == nop whenever nop >= 1).
         */
        *(npy_longlong *)dataptr[nop] = prod + *(npy_longlong *)dataptr[op];

        /* Step all inputs and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_longlong);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * Single contiguous input reduced into one output element: the value
 * returned by longlong_sum_of_arr for the input is added to the element
 * at dataptr[1].
 *
 * NOTE(review): longlong_sum_of_arr is defined elsewhere in this file;
 * presumably it returns the sum of the `count` elements -- confirm there.
 *
 * `nop` and `strides` are not read.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_longlong *src;
    npy_longlong *out;
    npy_longlong total;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    src = (npy_longlong *)dataptr[0];
    total = longlong_sum_of_arr(src, count);

    out = (npy_longlong *)dataptr[1];
    *out = total + *out;
}

#endif /* 1 == 1 */

/*
 * One-operand strided reduction into a single output element.
 *
 * Reads `count` npy_longlong values beginning at dataptr[0], advancing the
 * read position by strides[0] bytes after each one, and adds their sum to
 * the npy_longlong stored at dataptr[1].  `nop` is not read.
 */
static void
longlong_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    const npy_intp step = strides[0];
    char *cursor = dataptr[0];
    npy_longlong *out = (npy_longlong *)dataptr[1];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += *(npy_longlong *)cursor;
        /* byte-wise advance: the cursor is a char pointer */
        cursor += step;
    }

    /* Fold the partial sum into whatever the output already holds */
    *out = total + *out;
}


#line 131
/*
 * Two-operand strided multiply-accumulate.
 *
 * For each of `count` steps, multiplies the npy_longlong at the current
 * position of operand 0 by the one at the current position of operand 1,
 * adds the product to the npy_longlong at the current output position
 * (dataptr[2]), and then advances all three positions by their respective
 * byte strides (strides[0], strides[1], strides[2]).  `nop` is not read.
 */
static void
longlong_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *in0 = dataptr[0];
    char *in1 = dataptr[1];
    char *out = dataptr[2];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step_out = strides[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        const npy_longlong lhs = *(npy_longlong *)in0;
        const npy_longlong rhs = *(npy_longlong *)in1;
        npy_longlong *dst = (npy_longlong *)out;

        *dst = lhs * rhs + *dst;

        in0 += step0;
        in1 += step1;
        out += step_out;
    }
}

#if 2 == 1

/*
 * One-operand accumulate over consecutive elements:
 *     out[i] = in[i] + out[i]
 * with in = dataptr[0] and out = dataptr[1], both treated as arrays of
 * npy_longlong.  `nop` is not read and the strides argument is unused.
 *
 * The short-count case is tested first so that small counts return without
 * touching the eight-wide loop.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Zero to seven elements left: finish from the highest index down */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = src[k] + dst[k];
            }
            return;
        }

        /* Eight elements per pass, lowest index first */
        for (; count >= 8; count -= 8, src += 8, dst += 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] + dst[k];
            }
        }
    }
}

#elif 2 == 2 && !0

/*
 * Scaled accumulate over consecutive elements:
 *     data_out[i] = scalar * data[i] + data_out[i]      for i in [0, count)
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled
 * four at a time; a one-at-a-time loop then covers whatever remains.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read every input of the group before writing any result */
        const npy_longlong in0 = data[0];
        const npy_longlong in1 = data[1];
        const npy_longlong in2 = data[2];
        const npy_longlong in3 = data[3];
        const npy_longlong old0 = data_out[0];
        const npy_longlong old1 = data_out[1];
        const npy_longlong old2 = data_out[2];
        const npy_longlong old3 = data_out[3];

        data_out[0] = scalar * in0 + old0;
        data_out[1] = scalar * in1 + old1;
        data_out[2] = scalar * in2 + old2;
        data_out[3] = scalar * in3 + old3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one per iteration */
    while (count > 0) {
        *data_out = scalar * (*data) + (*data_out);
        ++data;
        ++data_out;
        --count;
    }
}

/*
 * Two-operand multiply-accumulate over consecutive elements:
 *     out[i] = in0[i] * in1[i] + out[i]      for i in [0, count)
 * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2], all treated
 * as arrays of npy_longlong.  `nop` is not read and the strides argument is
 * unused.
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled
 * four at a time; a one-at-a-time loop then covers whatever remains.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong *out = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read every input of the group before writing any result */
        const npy_longlong x0 = in0[0], y0 = in1[0], z0 = out[0];
        const npy_longlong x1 = in0[1], y1 = in1[1], z1 = out[1];
        const npy_longlong x2 = in0[2], y2 = in1[2], z2 = out[2];
        const npy_longlong x3 = in0[3], y3 = in1[3], z3 = out[3];

        out[0] = x0 * y0 + z0;
        out[1] = x1 * y1 + z1;
        out[2] = x2 * y2 + z2;
        out[3] = x3 * y3 + z3;

        count -= 4;
        in0 += 4;
        in1 += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one per iteration */
    while (count > 0) {
        *out = (*in0) * (*in1) + (*out);
        ++in0;
        ++in1;
        ++out;
        --count;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case where operand 0 is read once as a single value.
 *
 * Loads one npy_longlong from dataptr[0] and passes it as the scalar to
 * longlong_sum_of_products_muladd(), with dataptr[1] as the input array and
 * dataptr[2] as the array being accumulated into.  `nop` is not read and
 * the strides argument is unused.
 */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd((npy_longlong *)dataptr[1],
                                    (npy_longlong *)dataptr[2],
                                    factor, count);
}

/*
 * Two-operand case where operand 1 is read once as a single value.
 *
 * Loads one npy_longlong from dataptr[1] and passes it as the scalar to
 * longlong_sum_of_products_muladd(), with dataptr[0] as the input array and
 * dataptr[2] as the array being accumulated into.  `nop` is not read and
 * the strides argument is unused.
 */
static void
longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_longlong factor = *(npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd((npy_longlong *)dataptr[0],
                                    (npy_longlong *)dataptr[2],
                                    factor, count);
}

/*
 * Two-operand dot-product style reduction into a single output element.
 *
 * Sums in0[i] * in1[i] for i in [0, count), where in0 = dataptr[0] and
 * in1 = dataptr[1] are treated as arrays of npy_longlong, and adds that sum
 * to the npy_longlong stored at dataptr[2].  `nop` is not read and the
 * strides argument is unused.
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, products are first formed
 * four at a time; a one-at-a-time loop then covers whatever remains.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        const npy_longlong p0 = in0[0] * in1[0];
        const npy_longlong p1 = in0[1] * in1[1];
        const npy_longlong p2 = in0[2] * in1[2];
        const npy_longlong p3 = in0[3] * in1[3];

        total += p0 + p1 + p2 + p3;

        count -= 4;
        in0 += 4;
        in1 += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one per iteration */
    while (count > 0) {
        total += (*in0) * (*in1);
        ++in0;
        ++in1;
        --count;
    }

    /* Accumulate into the existing output value rather than overwrite it */
    {
        npy_longlong *out = (npy_longlong *)dataptr[2];
        *out = *out + total;
    }
}

/*
 * Two-operand reduction where operand 0 is read once as a single value.
 *
 * Loads one npy_longlong from dataptr[0], obtains the total of the `count`
 * consecutive npy_longlong values at dataptr[1] from longlong_sum_of_arr(),
 * and adds (value * total) to the npy_longlong stored at dataptr[2].
 * `nop` is not read and the strides argument is unused.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    const npy_longlong factor = *(npy_longlong *)dataptr[0];
    const npy_longlong total =
            longlong_sum_of_arr((npy_longlong *)dataptr[1], count);

    *out = *out + factor * total;
}

/*
 * Two-operand reduction where operand 1 is read once as a single value.
 *
 * Loads one npy_longlong from dataptr[1], obtains the total of the `count`
 * consecutive npy_longlong values at dataptr[0] from longlong_sum_of_arr(),
 * and adds (value * total) to the npy_longlong stored at dataptr[2].
 * `nop` is not read and the strides argument is unused.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    const npy_longlong factor = *(npy_longlong *)dataptr[1];
    const npy_longlong total =
            longlong_sum_of_arr((npy_longlong *)dataptr[0], count);

    *out = *out + factor * total;
}

#elif 2 == 3 && !0

/*
 * Three-operand kernel that indexes all four buffers as plain npy_longlong
 * arrays:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output.  `nop` and
 * `strides` are not consulted.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong *in2 = (npy_longlong *)dataptr[2];
    npy_longlong *out = (npy_longlong *)dataptr[3];
    int k;

    /* Bulk phase: consume the buffers eight elements at a time. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = ((in0[k]) *
                      (in1[k]) *
                      (in2[k]) +
                      (out[k]));
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Tail phase: at most eight further steps, each one first checking
     * whether the remaining count has reached zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = ((in0[k]) *
                  (in1[k]) *
                  (in2[k]) +
                  (out[k]));
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic sum-of-products step driven by `nop`.
 *
 * For each of `count` iterations the values at dataptr[0] .. dataptr[nop-1]
 * are multiplied together, the product is added into the element at
 * dataptr[nop], and then every pointer dataptr[0] .. dataptr[nop] is
 * advanced by sizeof(npy_longlong).  The pointers in `dataptr` are modified
 * in place; `strides` is not consulted.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op = 1;
        npy_longlong product = *(npy_longlong *)dataptr[0];

        while (op < nop) {
            product *= *(npy_longlong *)dataptr[op];
            ++op;
        }

        /* `op` is left at the index the multiply loop stopped on. */
        *(npy_longlong *)dataptr[nop] =
                product + *(npy_longlong *)dataptr[op];

        /* Step every operand, including the output, to its next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_longlong);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand reduction into a single output element.
 *
 * dataptr[0] and `count` are passed to longlong_sum_of_arr and the result is
 * added to the value stored at dataptr[1].  `nop` and `strides` are not
 * consulted.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
    npy_longlong *out = (npy_longlong *)dataptr[1];
    const npy_longlong total =
            longlong_sum_of_arr((npy_longlong *)dataptr[0], count);

    *out = total + *out;
}

#endif /* 2 == 1 */

/*
 * Two-operand strided dot product into a single output element.
 *
 * Walks dataptr[0] and dataptr[1] for `count` steps, advancing each by its
 * byte stride (strides[0] / strides[1]), accumulates the element-wise
 * products, and finally adds the accumulated value to the element at
 * dataptr[2].  `nop` is not consulted and `dataptr` itself is left
 * unmodified.
 */
static void
longlong_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_longlong total = 0;
    char *ptr0 = dataptr[0];
    char *ptr1 = dataptr[1];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    npy_longlong *out = (npy_longlong *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_longlong *)ptr0) * (*(npy_longlong *)ptr1);
        /* Strides are expressed in bytes, hence the char* cursors. */
        ptr0 += step0;
        ptr1 += step1;
    }

    /* The output is only touched once, after the whole reduction. */
    *out = total + *out;
}


#line 131
/*
 * Three-operand strided sum of products.
 *
 * For each of `count` steps computes
 *
 *     *out = (*in0) * (*in1) * (*in2) + (*out)
 *
 * where in0/in1/in2/out start at dataptr[0..3] and advance by the byte
 * strides strides[0..3] after every step.  `nop` is not consulted and
 * `dataptr` itself is left unmodified.
 */
static void
longlong_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *ptr0 = dataptr[0];
    char *ptr1 = dataptr[1];
    char *ptr2 = dataptr[2];
    char *ptr_out = dataptr[3];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step2 = strides[2];
    const npy_intp step_out = strides[3];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_three (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_longlong *out = (npy_longlong *)ptr_out;
        const npy_longlong product = (*(npy_longlong *)ptr0) *
                                     (*(npy_longlong *)ptr1) *
                                     (*(npy_longlong *)ptr2);

        *out = product + *out;

        /* Strides are expressed in bytes, hence the char* cursors. */
        ptr0 += step0;
        ptr1 += step1;
        ptr2 += step2;
        ptr_out += step_out;
    }
}

#if 3 == 1

/*
 * One-operand accumulate over plain npy_longlong arrays:
 *
 *     dst[i] = src[i] + dst[i]
 *
 * with src = dataptr[0] and dst = dataptr[1].  `nop` and `strides` are not
 * consulted.
 *
 * The tail (fewer than eight elements) is tested before the bulk loop so
 * that short inputs return without entering it; longer inputs run the bulk
 * loop and then come back round to the tail.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Tail: 0..7 elements left, handled from the highest index down. */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }
            return;
        }

        /* Bulk: eight elements per pass. */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }

            src += 8;
            dst += 8;
        }
        /* NOTE(review): a negative count satisfies neither stage above. */
    }
}

#elif 3 == 2 && !0

/*
 * Scaled accumulate: data_out[i] = scalar * data[i] + data_out[i] for
 * `count` elements.
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled in
 * groups of four, where all eight loads of a group happen before any of its
 * four stores; whatever remains is processed one element at a time.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        npy_longlong fused[4];
        int k;

        /* Compute the whole group first ... */
        for (k = 0; k < 4; ++k) {
            fused[k] = scalar * data[k] + data_out[k];
        }
        /* ... then write it back. */
        for (k = 0; k < 4; ++k) {
            data_out[k] = fused[k];
        }

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *data_out = scalar * (*data) + (*data_out);
        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand multiply-accumulate over plain npy_longlong arrays:
 *
 *     out[i] = lhs[i] * rhs[i] + out[i]
 *
 * with lhs = dataptr[0], rhs = dataptr[1] and out = dataptr[2].  `nop` and
 * `strides` are not consulted.
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first handled in
 * groups of four, where every load of a group happens before any of its
 * stores; whatever remains is processed one element at a time.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *lhs = (npy_longlong *)dataptr[0];
    npy_longlong *rhs = (npy_longlong *)dataptr[1];
    npy_longlong *out = (npy_longlong *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        npy_longlong fused[4];
        int k;

        /* Compute the whole group first ... */
        for (k = 0; k < 4; ++k) {
            fused[k] = lhs[k] * rhs[k] + out[k];
        }
        /* ... then write it back. */
        for (k = 0; k < 4; ++k) {
            out[k] = fused[k];
        }

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *out = (*lhs) * (*rhs) + (*out);
        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 is a single value.
 *
 * Reads the scalar at dataptr[0] and forwards dataptr[1] (input),
 * dataptr[2] (output), that scalar and `count` to
 * longlong_sum_of_products_muladd.  `nop` and `strides` are not consulted.
 */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    /* Capture the scalar operand up front, before any output is written. */
    const npy_longlong scalar = *(npy_longlong *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    longlong_sum_of_products_muladd((npy_longlong *)dataptr[1],
                                    (npy_longlong *)dataptr[2],
                                    scalar, count);
}

/*
 * Mirror of the stride0/contig wrapper: here the scalar multiplier is the
 * single npy_longlong at dataptr[1] and the input array is dataptr[0].
 * Both are forwarded, with the output array at dataptr[2], to
 * longlong_sum_of_products_muladd for `count` elements.
 *
 * `nop` and `strides` are not consulted.
 */
static void
longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *const out = (npy_longlong *)dataptr[2];
    npy_longlong *const vec = (npy_longlong *)dataptr[0];
    const npy_longlong scalar = *(npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    longlong_sum_of_products_muladd(vec, out, scalar, count);
}

/*
 * Sums the element-wise products of the two npy_longlong arrays at
 * dataptr[0] and dataptr[1] over `count` elements and adds that sum to the
 * single npy_longlong stored at dataptr[2].
 *
 * Only the scalar implementation is present: the NPYV (vector) variant of
 * this template is disabled for npy_longlong.
 *
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *lhs = (npy_longlong *)dataptr[0];
    npy_longlong *rhs = (npy_longlong *)dataptr[1];
    npy_longlong total = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per iteration, folded into the running total at once. */
    while (count >= 4) {
        const npy_longlong p0 = (lhs[0]) * (rhs[0]);
        const npy_longlong p1 = (lhs[1]) * (rhs[1]);
        const npy_longlong p2 = (lhs[2]) * (rhs[2]);
        const npy_longlong p3 = (lhs[3]) * (rhs[3]);

        total += p0 + p1 + p2 + p3;
        lhs += 4;
        rhs += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time. */
    while (count > 0) {
        total += (*lhs) * (*rhs);
        ++lhs;
        ++rhs;
        --count;
    }
    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + total);
}

/*
 * Adds  dataptr[0][0] * sum(dataptr[1][0..count))  to the single
 * npy_longlong at dataptr[2].  The array sum is computed by
 * longlong_sum_of_arr.
 *
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *const vec = (npy_longlong *)dataptr[1];
    const npy_longlong scalar = *(npy_longlong *)dataptr[0];
    const npy_longlong total = longlong_sum_of_arr(vec, count);
    npy_longlong *const out = (npy_longlong *)dataptr[2];

    *out = (*out) + scalar * total;
}

/*
 * Adds  sum(dataptr[0][0..count)) * dataptr[1][0]  to the single
 * npy_longlong at dataptr[2].  The array sum is computed by
 * longlong_sum_of_arr.
 *
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *const vec = (npy_longlong *)dataptr[0];
    const npy_longlong scalar = *(npy_longlong *)dataptr[1];
    const npy_longlong total = longlong_sum_of_arr(vec, count);
    npy_longlong *const out = (npy_longlong *)dataptr[2];

    *out = (*out) + scalar * total;
}

#elif 3 == 3 && !0

/*
 * Three-operand kernel over npy_longlong arrays:
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 * with in0..in2 taken from dataptr[0..2] and out from dataptr[3].
 *
 * Elements are processed in ascending order, eight per pass of the main
 * loop; the second loop handles what is left over.
 *
 * `nop` and `strides` are not consulted.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    npy_longlong *in2 = (npy_longlong *)dataptr[2];
    npy_longlong *out = (npy_longlong *)dataptr[3];
    int k;

    /* Main pass: groups of eight elements. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Leftovers: before each element, stop as soon as the remaining count
     * has reached zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic fallback form of the contiguous kernel, driven by `nop`:
 * for each of `count` steps it multiplies together the npy_longlong values
 * currently pointed to by dataptr[0..nop-1], adds the product into the
 * value at dataptr[nop], and then advances every pointer in
 * dataptr[0..nop] by sizeof(npy_longlong).
 *
 * Note that the entries of `dataptr` themselves are modified.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_longlong product = (*(npy_longlong *)dataptr[0]);
        int k;

        for (k = 1; k < nop; ++k) {
            product *= (*(npy_longlong *)dataptr[k]);
        }
        /*
         * k has run past the last operand and now indexes the output slot
         * (k == nop whenever nop >= 1).
         */
        *(npy_longlong *)dataptr[nop] = (product +
                                           (*(npy_longlong *)dataptr[k]));

        /* Step all operands and the output to the next element. */
        for (k = 0; k <= nop; ++k) {
            dataptr[k] += sizeof(npy_longlong);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * Single-operand reduction: adds the sum of the `count` npy_longlong values
 * at dataptr[0] (computed by longlong_sum_of_arr) to the single value at
 * dataptr[1].
 *
 * `nop` and `strides` are not consulted.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_longlong *src;
    npy_longlong *out;
    npy_longlong total;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    src = (npy_longlong *)dataptr[0];
    total = longlong_sum_of_arr(src, count);
    out = (npy_longlong *)dataptr[1];
    *out = (total + (*out));
}

#endif /* 3 == 1 */

/*
 * Three-operand reduction into a single output value.
 *
 * For each of `count` steps, multiplies the npy_longlong values currently
 * addressed through dataptr[0], dataptr[1] and dataptr[2], accumulates the
 * product, and advances each operand pointer by its entry in `strides`
 * (byte offsets, since the pointers are stepped as char *).  The accumulated
 * total is finally added to the npy_longlong at dataptr[3].
 *
 * The entries of `dataptr` are not modified; `nop` is not consulted.
 */
static void
longlong_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_longlong total = 0;

    char *p0 = dataptr[0];
    char *p1 = dataptr[1];
    char *p2 = dataptr[2];
    const npy_intp step0 = strides[0];
    const npy_intp step1 = strides[1];
    const npy_intp step2 = strides[2];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_longlong *)p0) *
                 (*(npy_longlong *)p1) *
                 (*(npy_longlong *)p2);
        p0 += step0;
        p1 += step1;
        p2 += step2;
    }

    *((npy_longlong *)dataptr[3]) = (total +
                                    (*((npy_longlong *)dataptr[3])));
}


#line 131
/*
 * Sum-of-products kernel for an arbitrary operand count.
 *
 * For each of `count` steps it multiplies together the npy_longlong values
 * currently pointed to by dataptr[0..nop-1], adds the product into the
 * value at dataptr[nop], and then advances every pointer in
 * dataptr[0..nop] by the matching entry of `strides`.
 *
 * Note that the entries of `dataptr` themselves are modified.
 */
static void
longlong_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_any (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_longlong product = (*(npy_longlong *)dataptr[0]);
        int k;

        for (k = 1; k < nop; ++k) {
            product *= (*(npy_longlong *)dataptr[k]);
        }
        /*
         * k has run past the last operand and now indexes the output slot
         * (k == nop whenever nop >= 1).
         */
        *(npy_longlong *)dataptr[nop] = (product +
                                           (*(npy_longlong *)dataptr[k]));

        /* Step all operands and the output by their own strides. */
        for (k = 0; k <= nop; ++k) {
            dataptr[k] += strides[k];
        }
    }
}

#if 1000 == 1

/*
 * Single-operand kernel over npy_longlong arrays:
 *     dst[i] = src[i] + dst[i]
 * with src taken from dataptr[0] and dst from dataptr[1].
 *
 * Counts below eight are dispatched through the switch (highest index
 * first, falling through to index 0).  Larger counts go through the
 * eight-wide loop and then jump back to the switch for the remainder.
 *
 * `nop` and `strides` are not consulted.
 */
static void
longlong_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *dst = (npy_longlong *)dataptr[1];
    int k;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* The remainder dispatch comes first so that short inputs finish quickly */
handle_remainder:
    switch (count) {
        case 7:
            dst[6] = ((src[6]) + (dst[6]));
            /* fallthrough */
        case 6:
            dst[5] = ((src[5]) + (dst[5]));
            /* fallthrough */
        case 5:
            dst[4] = ((src[4]) + (dst[4]));
            /* fallthrough */
        case 4:
            dst[3] = ((src[3]) + (dst[3]));
            /* fallthrough */
        case 3:
            dst[2] = ((src[2]) + (dst[2]));
            /* fallthrough */
        case 2:
            dst[1] = ((src[1]) + (dst[1]));
            /* fallthrough */
        case 1:
            dst[0] = ((src[0]) + (dst[0]));
            /* fallthrough */
        case 0:
            return;
    }

    /* Eight elements per pass, in ascending order. */
    while (count >= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] = ((src[k]) + (dst[k]));
        }
        src += 8;
        dst += 8;
        count -= 8;
    }

    /* Whatever is left goes back through the switch above. */
    goto handle_remainder;
}

#elif 1000 == 2 && !0

/*
 * Scalar multiply-accumulate over `count` elements:
 *     data_out[i] = scalar * data[i] + data_out[i]
 *
 * Only the scalar implementation is present: the NPYV (vector) variant of
 * this template is disabled for npy_longlong.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per iteration.  All four results are computed from the
     * values read at the start of the iteration before any of them is
     * written back.
     */
    while (count >= 4) {
        const npy_longlong r0 = scalar * (data[0]) + (data_out[0]);
        const npy_longlong r1 = scalar * (data[1]) + (data_out[1]);
        const npy_longlong r2 = scalar * (data[2]) + (data_out[2]);
        const npy_longlong r3 = scalar * (data[3]) + (data_out[3]);

        data_out[0] = (r0);
        data_out[1] = (r1);
        data_out[2] = (r2);
        data_out[3] = (r3);

        data += 4;
        data_out += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time. */
    while (count > 0) {
        *data_out = (scalar * (*data) + (*data_out));
        ++data;
        ++data_out;
        --count;
    }
}

/*
 * Two-operand multiply-accumulate over npy_longlong arrays:
 *     data_out[i] = data0[i] * data1[i] + data_out[i]   for i in [0, count)
 * where data0, data1 and data_out are taken from dataptr[0], dataptr[1] and
 * dataptr[2] and are walked one element at a time.
 *
 * `nop` is not referenced by the body and `strides` is marked NPY_UNUSED.
 * The NPYV (SIMD) branch below is disabled by `#if 0`, so only the scalar
 * branch after the matching `#else` is compiled.
 */
static void
longlong_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong *data_out = (npy_longlong *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_longlong
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_s64;

    #line 384
    if(is_aligned) {
        /* Four vectors per iteration: load everything, multiply-add, store. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
            
#line 390
            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
            
#line 390
            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
            
#line 390
            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
            
            #line 397
            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
            
#line 397
            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
            
#line 397
            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
            
#line 397
            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
            
            #line 402
            npyv_storea_s64(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_s64(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_s64(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_s64(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        /*
         * Same four-vector loop, taken when the alignment check fails; it
         * uses the npyv_load/npyv_store variants instead of loada/storea.
         */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
            
#line 390
            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
            
#line 390
            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
            
#line 390
            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
            
            #line 397
            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
            
#line 397
            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
            
#line 397
            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
            
#line 397
            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
            
            #line 402
            npyv_store_s64(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_s64(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_s64(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_s64(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Tail: one vector step at a time. The `_tillz`/`_till` variants receive
     * the remaining `count`; presumably they restrict the access to that
     * many lanes -- confirm against the NPYV API.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_s64 a = npyv_load_tillz_s64(data0, count);
        npyv_s64 b = npyv_load_tillz_s64(data1, count);
        npyv_s64 c = npyv_load_tillz_s64(data_out, count);
        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled by four: all operands of the group are read
     * first, then the four results are computed, then written back.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_longlong a0 = (data0[0]);
        const npy_longlong b0 = (data1[0]);
        const npy_longlong c0 = (data_out[0]);
        
#line 420
        const npy_longlong a1 = (data0[1]);
        const npy_longlong b1 = (data1[1]);
        const npy_longlong c1 = (data_out[1]);
        
#line 420
        const npy_longlong a2 = (data0[2]);
        const npy_longlong b2 = (data1[2]);
        const npy_longlong c2 = (data_out[2]);
        
#line 420
        const npy_longlong a3 = (data0[3]);
        const npy_longlong b3 = (data1[3]);
        const npy_longlong c3 = (data_out[3]);
        
        #line 427
        const npy_longlong abc0 = a0 * b0 + c0;
        
#line 427
        const npy_longlong abc1 = a1 * b1 + c1;
        
#line 427
        const npy_longlong abc2 = a2 * b2 + c2;
        
#line 427
        const npy_longlong abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Leftover elements one at a time (this loop handles every element when
     * NPY_DISABLE_OPTIMIZATION is defined).
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_longlong a = (*data0);
        const npy_longlong b = (*data1);
        const npy_longlong c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_longlong

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 supplies a single npy_longlong value
 * (read once from dataptr[0]) and operand 1 / the output are walked as
 * arrays. The work is delegated to longlong_sum_of_products_muladd, called
 * as (input array, output array, scalar, count).
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED.
 */
static void
longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    npy_longlong *in1 = (npy_longlong *)dataptr[1];
    /* The single value contributed by operand 0. */
    const npy_longlong scalar0 = *(npy_longlong *)dataptr[0];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd(in1, out, scalar0, count);
}

/*
 * Two-operand kernel where operand 1 supplies a single npy_longlong value
 * (read once from dataptr[1]) and operand 0 / the output are walked as
 * arrays. The work is delegated to longlong_sum_of_products_muladd, called
 * as (input array, output array, scalar, count).
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED.
 */
static void
longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    npy_longlong *in0 = (npy_longlong *)dataptr[0];
    /* The single value contributed by operand 1. */
    const npy_longlong scalar1 = *(npy_longlong *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    longlong_sum_of_products_muladd(in0, out, scalar1, count);
}

/*
 * Two-operand reduction into a single output element: accumulates
 * data0[i] * data1[i] for i in [0, count) into a local `accum` and then adds
 * that total to the npy_longlong stored at dataptr[2]. data0 and data1 come
 * from dataptr[0] and dataptr[1] and are walked one element at a time.
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED. The NPYV (SIMD)
 * branch is disabled by `#if 0`, so only the scalar `#else` branch is
 * compiled.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong accum = 0;

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_longlong
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_s64;
    npyv_s64 vaccum = npyv_zero_s64();

    #line 495
    if(is_aligned) {
        /* Four vector pairs per iteration, chained through vaccum. */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
            
#line 501
            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
            
#line 501
            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
            
#line 501
            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
            
            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, vaccum);
            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
                    vaccum = npyv_muladd_s64(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        /*
         * Same loop, taken when the alignment check fails; it uses
         * npyv_load instead of npyv_loada.
         */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
            
#line 501
            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
            
#line 501
            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
            
#line 501
            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
            
            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, vaccum);
            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
                    vaccum = npyv_muladd_s64(a0, b0, ab1);
        }
    }
    
    /*
     * Tail: one vector step at a time using the `_tillz` loads, which are
     * passed the remaining `count` (presumably limiting the lanes read --
     * confirm against the NPYV API).
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_s64 a = npyv_load_tillz_s64(data0, count);
        npyv_s64 b = npyv_load_tillz_s64(data1, count);
        vaccum = npyv_muladd_s64(a, b, vaccum);
    }
    /* Collapse the vector accumulator into the scalar one. */
    accum = npyv_sum_s64(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, unrolled by four: four products added to accum per pass. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_longlong ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_longlong ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_longlong ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_longlong ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Leftover elements one at a time (this loop handles every element when
     * NPY_DISABLE_OPTIMIZATION is defined).
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_longlong a = (*data0);
        const npy_longlong b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_longlong
    /* Fold the local total into the single output element. */
    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction into one output element where operand 0 supplies a
 * single value: the elements of dataptr[1] are summed with
 * longlong_sum_of_arr, the sum is multiplied by the value read from
 * dataptr[0], and the product is added to the npy_longlong at dataptr[2].
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    const npy_longlong factor = *(npy_longlong *)dataptr[0];
    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);

    *out += factor * total;
}

/*
 * Two-operand reduction into one output element where operand 1 supplies a
 * single value: the elements of dataptr[0] are summed with
 * longlong_sum_of_arr, the sum is multiplied by the value read from
 * dataptr[1], and the product is added to the npy_longlong at dataptr[2].
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *out = (npy_longlong *)dataptr[2];
    const npy_longlong factor = *(npy_longlong *)dataptr[1];
    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);

    *out += factor * total;
}

#elif 1000 == 3 && !0

/*
 * Three-operand multiply-accumulate over npy_longlong arrays:
 *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
 * with the inputs taken from dataptr[0..2] and the output from dataptr[3],
 * all walked one element at a time.
 *
 * Elements are processed in groups of eight while at least eight remain;
 * the remainder is handled by a tail that stops as soon as the post-
 * decremented `count` reaches zero.
 *
 * `nop` is not referenced; `strides` is marked NPY_UNUSED.
 */
static void
longlong_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_longlong *data0 = (npy_longlong *)dataptr[0];
    npy_longlong *data1 = (npy_longlong *)dataptr[1];
    npy_longlong *data2 = (npy_longlong *)dataptr[2];
    npy_longlong *data_out = (npy_longlong *)dataptr[3];
    int k;

    /* Full groups of eight elements. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
        }
        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Tail: at most eight slots, leaving once `count` is used up. */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Sum-of-products kernel for an arbitrary number of operands.
 *
 * For each of `count` steps, multiplies together the npy_longlong values
 * currently pointed to by dataptr[0] .. dataptr[nop-1], adds the product to
 * the value at dataptr[nop], and then advances every pointer in
 * dataptr[0..nop] by sizeof(npy_longlong). Note that the entries of
 * `dataptr` themselves are modified. `strides` is marked NPY_UNUSED.
 *
 * The `#else` (complex) branch is compiled out in this instantiation
 * (`#if !0`).
 */
static void
longlong_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_longlong *)dataptr[i]);
        }
        /*
         * Read and write the output through the same explicit index `nop`
         * instead of the leftover loop counter, which only equals `nop`
         * when nop >= 1.
         */
        *(npy_longlong *)dataptr[nop] = (temp +
                                           (*(npy_longlong *)dataptr[nop]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_longlong);
        }
#else /* complex */
#  if 1000 <= 3
#    define _SUMPROD_NOP 1000
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_longlong re, im, tmp;
        int i;
        re = ((npy_longlong *)dataptr[0])[0];
        im = ((npy_longlong *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_longlong *)dataptr[i])[0] -
                  im * ((npy_longlong *)dataptr[i])[1];
            im = re * ((npy_longlong *)dataptr[i])[1] +
                 im * ((npy_longlong *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_longlong);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * Single-operand reduction into one output element.
 *
 * Real branch (`#if !0`, the one selected here): sums `count` npy_longlong
 * values starting at dataptr[0] via longlong_sum_of_arr and adds the total
 * to the value at dataptr[1].
 *
 * Complex branch (compiled out): treats the input as interleaved
 * (real, imaginary) pairs and accumulates the two components separately
 * into dataptr[1][0] and dataptr[1][1].
 *
 * `nop` and `strides` are not referenced by the body.
 */
static NPY_GCC_OPT_3 void
longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_longlong *out = (npy_longlong *)dataptr[1];
    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);

    *out = total + *out;
#else
    npy_longlong *src = (npy_longlong *)dataptr[0];
    npy_longlong *out = (npy_longlong *)dataptr[1];
    npy_longlong sum_re = 0, sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (real, imaginary) pairs per pass while more than four remain. */
    while (count > 4) {
        const npy_longlong re_lo = src[0] + src[2];
        const npy_longlong re_hi = src[4] + src[6];
        const npy_longlong im_lo = src[1] + src[3];
        const npy_longlong im_hi = src[5] + src[7];
        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
        count -= 4;
        src += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining pairs one at a time. */
    while (count > 0) {
        sum_re += src[0];
        sum_im += src[1];
        --count;
        src += 2;
    }
    out[0] += sum_re;
    out[1] += sum_im;
#endif // !0
}

#endif /* 1000 == 1 */

/*
 * Sum-of-products reduction into a single output element for an arbitrary
 * number of operands.
 *
 * In this instantiation the template's operand-count constant is 1000, so
 * every `1000 == 1/2/3` and `1000 <= 3` branch below is compiled out, as is
 * the complex path (`#if 0` / the `#else` of `#if !0`). What remains: for
 * each of `count` steps, multiply the npy_longlong values at
 * dataptr[0] .. dataptr[nop-1], add the product to a local `accum`, and
 * advance each of those pointers by its entry in `strides`; finally add
 * `accum` to the value at dataptr[nop]. The entries dataptr[0..nop-1] are
 * modified; dataptr[nop] is not advanced.
 */
static void
longlong_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if 0
    npy_longlong accum_re = 0, accum_im = 0;
#else
    npy_longlong accum = 0;
#endif

    /* Local pointer/stride copies exist only for the fixed 1-3 operand cases. */
#if (1000 == 1) || (1000 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1000 == 2 || 1000 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1000 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 1000 == 1
        accum += (*(npy_longlong *)data0);
        data0 += stride0;
#  elif 1000 == 2
        accum += (*(npy_longlong *)data0) *
                 (*(npy_longlong *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 1000 == 3
        accum += (*(npy_longlong *)data0) *
                 (*(npy_longlong *)data1) *
                 (*(npy_longlong *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        /* Generic case: product over all nop inputs, then step each input. */
        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_longlong *)dataptr[i]);
        }
        accum += temp;
        /* Only the inputs are advanced (i < nop); the output pointer stays put. */
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1000 == 1
        accum_re += ((npy_longlong *)data0)[0];
        accum_im += ((npy_longlong *)data0)[1];
        data0 += stride0;
#  else
#    if 1000 <= 3
#define _SUMPROD_NOP 1000
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_longlong re, im, tmp;
        int i;
        re = ((npy_longlong *)dataptr[0])[0];
        im = ((npy_longlong *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_longlong *)dataptr[i])[0] -
                  im * ((npy_longlong *)dataptr[i])[1];
            im = re * ((npy_longlong *)dataptr[i])[1] +
                 im * ((npy_longlong *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the local accumulator into the single output element. */
#if 0
#  if 1000 <= 3
    ((npy_longlong *)dataptr[1000])[0] += accum_re;
    ((npy_longlong *)dataptr[1000])[1] += accum_im;
#  else
    ((npy_longlong *)dataptr[nop])[0] += accum_re;
    ((npy_longlong *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 1000 <= 3
    *((npy_longlong *)dataptr[1000]) = (accum +
                                    (*((npy_longlong *)dataptr[1000])));
#  else
    *((npy_longlong *)dataptr[nop]) = (accum +
                                    (*((npy_longlong *)dataptr[nop])));
#  endif
#endif

}




#line 74

#if !0
/*
 * Sum `count` npy_ubyte values starting at `data` and return the total.
 *
 * The accumulator and the return value have type npy_ubyte, so the total is
 * truncated to that type. The NPYV (SIMD) branch is disabled by `#if 0`;
 * only the scalar `#else` branch is compiled.
 */
static NPY_GCC_OPT_3 npy_ubyte ubyte_sum_of_arr(npy_ubyte *data, npy_intp count)
{
    npy_ubyte accum = 0;
#if 0 // NPYV check for npy_ubyte
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data);
    const int vstep = npyv_nlanes_u8;
    npyv_u8 vaccum = npyv_zero_u8();
    const npy_intp vstepx4 = vstep * 4;

    #line 91
    if(is_aligned) {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_u8 a0 = npyv_loada_u8(data + vstep * 0);
            
#line 96
            npyv_u8 a1 = npyv_loada_u8(data + vstep * 1);
            
#line 96
            npyv_u8 a2 = npyv_loada_u8(data + vstep * 2);
            
#line 96
            npyv_u8 a3 = npyv_loada_u8(data + vstep * 3);
            
            npyv_u8 a01   = npyv_add_u8(a0, a1);
            npyv_u8 a23   = npyv_add_u8(a2, a3);
            npyv_u8 a0123 = npyv_add_u8(a01, a23);
                      vaccum = npyv_add_u8(a0123, vaccum);
        }
    }
    
#line 91
    else {
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
            #line 96
            npyv_u8 a0 = npyv_load_u8(data + vstep * 0);
            
#line 96
            npyv_u8 a1 = npyv_load_u8(data + vstep * 1);
            
#line 96
            npyv_u8 a2 = npyv_load_u8(data + vstep * 2);
            
#line 96
            npyv_u8 a3 = npyv_load_u8(data + vstep * 3);
            
            npyv_u8 a01   = npyv_add_u8(a0, a1);
            npyv_u8 a23   = npyv_add_u8(a2, a3);
            npyv_u8 a0123 = npyv_add_u8(a01, a23);
                      vaccum = npyv_add_u8(a0123, vaccum);
        }
    }
    
    for (; count > 0; count -= vstep, data += vstep) {
        npyv_u8 a = npyv_load_tillz_u8(data, count);
        vaccum = npyv_add_u8(a, vaccum);
    }
    accum = npyv_sum_u8(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Scalar path, unrolled by four. The guard is `>= 4` so that a block of
     * exactly four remaining elements also takes the unrolled body, matching
     * the other unrolled scalar loops in this file. The sum is unchanged
     * because every partial sum is truncated to npy_ubyte either way.
     */
    for (; count >= 4; count -= 4, data += 4) {
        const npy_ubyte a01 = (*data) + (data[1]);
        const npy_ubyte a23 = (data[2]) + (data[3]);
        accum +=  a01 + a23;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Leftover elements one at a time. */
    for (; count > 0; --count, data++) {
        accum += (*data);
    }
#endif // NPYV check for npy_ubyte
    return accum;
}
#endif

#line 131
/*
 * Strided single-operand accumulate for npy_ubyte.
 *
 * In this instantiation the template's operand-count constant is 1 and the
 * complex path is disabled (`#if !0`), so the compiled body is: for each of
 * `count` steps, add the npy_ubyte at data0 to the npy_ubyte at data_out,
 * then advance the two char pointers by strides[0] and strides[1]
 * respectively. data0 and data_out start at dataptr[0] and dataptr[1];
 * the `dataptr` array itself is not modified on this path.
 *
 * `nop` is not referenced on the compiled path.
 */
static void
ubyte_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (1 == 1) || (1 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1 == 2 || 1 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
#if (1 == 1) || (1 <= 3 && !0)
    /* With one operand, the output is the entry right after it. */
    char *data_out = dataptr[1];
    npy_intp stride_out = strides[1];
#endif

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_one (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 1 == 1
        /* The branch compiled for this instantiation. */
        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) +
                                         (*(npy_ubyte *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 1 == 2
        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
                                         (*(npy_ubyte *)data1) +
                                         (*(npy_ubyte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 1 == 3
        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
                                         (*(npy_ubyte *)data1) *
                                         (*(npy_ubyte *)data2) +
                                         (*(npy_ubyte *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_ubyte *)dataptr[i]);
        }
        *(npy_ubyte *)dataptr[nop] = (temp +
                                           (*(npy_ubyte *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1 == 1
        ((npy_ubyte *)data_out)[0] = ((npy_ubyte *)data0)[0] +
                                         ((npy_ubyte *)data_out)[0];
        ((npy_ubyte *)data_out)[1] = ((npy_ubyte *)data0)[1] +
                                         ((npy_ubyte *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_ubyte re, im, tmp;
        int i;
        re = ((npy_ubyte *)dataptr[0])[0];
        im = ((npy_ubyte *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
                  im * ((npy_ubyte *)dataptr[i])[1];
            im = re * ((npy_ubyte *)dataptr[i])[1] +
                 im * ((npy_ubyte *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 1 == 1

static void
ubyte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *data_out = (npy_ubyte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_ubyte *)data_out + 2*6)[0] =
                                    ((npy_ubyte *)data0 + 2*6)[0] +
                                    ((npy_ubyte *)data_out + 2*6)[0];
            ((npy_ubyte *)data_out + 2*6)[1] =
                                    ((npy_ubyte *)data0 + 2*6)[1] +
                                    ((npy_ubyte *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_ubyte *)data_out + 2*5)[0] =
                                    ((npy_ubyte *)data0 + 2*5)[0] +
                                    ((npy_ubyte *)data_out + 2*5)[0];
            ((npy_ubyte *)data_out + 2*5)[1] =
                                    ((npy_ubyte *)data0 + 2*5)[1] +
                                    ((npy_ubyte *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_ubyte *)data_out + 2*4)[0] =
                                    ((npy_ubyte *)data0 + 2*4)[0] +
                                    ((npy_ubyte *)data_out + 2*4)[0];
            ((npy_ubyte *)data_out + 2*4)[1] =
                                    ((npy_ubyte *)data0 + 2*4)[1] +
                                    ((npy_ubyte *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_ubyte *)data_out + 2*3)[0] =
                                    ((npy_ubyte *)data0 + 2*3)[0] +
                                    ((npy_ubyte *)data_out + 2*3)[0];
            ((npy_ubyte *)data_out + 2*3)[1] =
                                    ((npy_ubyte *)data0 + 2*3)[1] +
                                    ((npy_ubyte *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_ubyte *)data_out + 2*2)[0] =
                                    ((npy_ubyte *)data0 + 2*2)[0] +
                                    ((npy_ubyte *)data_out + 2*2)[0];
            ((npy_ubyte *)data_out + 2*2)[1] =
                                    ((npy_ubyte *)data0 + 2*2)[1] +
                                    ((npy_ubyte *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_ubyte *)data_out + 2*1)[0] =
                                    ((npy_ubyte *)data0 + 2*1)[0] +
                                    ((npy_ubyte *)data_out + 2*1)[0];
            ((npy_ubyte *)data_out + 2*1)[1] =
                                    ((npy_ubyte *)data0 + 2*1)[1] +
                                    ((npy_ubyte *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_ubyte *)data_out + 2*0)[0] =
                                    ((npy_ubyte *)data0 + 2*0)[0] +
                                    ((npy_ubyte *)data_out + 2*0)[0];
            ((npy_ubyte *)data_out + 2*0)[1] =
                                    ((npy_ubyte *)data0 + 2*0)[1] +
                                    ((npy_ubyte *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*0)[0] =
                                ((npy_ubyte *)data0 + 2*0)[0] +
                                ((npy_ubyte *)data_out + 2*0)[0];
        ((npy_ubyte *)data_out + 2*0)[1] =
                                ((npy_ubyte *)data0 + 2*0)[1] +
                                ((npy_ubyte *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*1)[0] =
                                ((npy_ubyte *)data0 + 2*1)[0] +
                                ((npy_ubyte *)data_out + 2*1)[0];
        ((npy_ubyte *)data_out + 2*1)[1] =
                                ((npy_ubyte *)data0 + 2*1)[1] +
                                ((npy_ubyte *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*2)[0] =
                                ((npy_ubyte *)data0 + 2*2)[0] +
                                ((npy_ubyte *)data_out + 2*2)[0];
        ((npy_ubyte *)data_out + 2*2)[1] =
                                ((npy_ubyte *)data0 + 2*2)[1] +
                                ((npy_ubyte *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*3)[0] =
                                ((npy_ubyte *)data0 + 2*3)[0] +
                                ((npy_ubyte *)data_out + 2*3)[0];
        ((npy_ubyte *)data_out + 2*3)[1] =
                                ((npy_ubyte *)data0 + 2*3)[1] +
                                ((npy_ubyte *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*4)[0] =
                                ((npy_ubyte *)data0 + 2*4)[0] +
                                ((npy_ubyte *)data_out + 2*4)[0];
        ((npy_ubyte *)data_out + 2*4)[1] =
                                ((npy_ubyte *)data0 + 2*4)[1] +
                                ((npy_ubyte *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*5)[0] =
                                ((npy_ubyte *)data0 + 2*5)[0] +
                                ((npy_ubyte *)data_out + 2*5)[0];
        ((npy_ubyte *)data_out + 2*5)[1] =
                                ((npy_ubyte *)data0 + 2*5)[1] +
                                ((npy_ubyte *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*6)[0] =
                                ((npy_ubyte *)data0 + 2*6)[0] +
                                ((npy_ubyte *)data_out + 2*6)[0];
        ((npy_ubyte *)data_out + 2*6)[1] =
                                ((npy_ubyte *)data0 + 2*6)[1] +
                                ((npy_ubyte *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_ubyte *)data_out + 2*7)[0] =
                                ((npy_ubyte *)data0 + 2*7)[0] +
                                ((npy_ubyte *)data_out + 2*7)[0];
        ((npy_ubyte *)data_out + 2*7)[1] =
                                ((npy_ubyte *)data0 + 2*7)[1] +
                                ((npy_ubyte *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 1 == 2 && !0

/*
 * Multiply-and-add of a scalar into a contiguous output array:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for 0 <= i < count
 *
 * data      elements that are multiplied by `scalar` (only read)
 * data_out  elements that are added to the product and then overwritten
 * scalar    common factor applied to every element of `data`
 * count     number of elements to process
 *
 * Each result is assigned back to an npy_ubyte, i.e. it is converted to
 * that type on store.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
{
#if 0 // NPYV check for npy_ubyte
    /*
     * SIMD path.  The check above is the constant 0 for this type, so this
     * branch is never compiled; the scalar code after the #else is used.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u8;
    const npyv_u8 v_scalar = npyv_setall_u8(scalar);
    #line 306
    if(is_aligned) {
        /* Both pointers aligned: four vectors per iteration, aligned load/store */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u8 b0 = npyv_loada_u8(data + vstep * 0);
            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
            
#line 312
            npyv_u8 b1 = npyv_loada_u8(data + vstep * 1);
            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
            
#line 312
            npyv_u8 b2 = npyv_loada_u8(data + vstep * 2);
            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
            
#line 312
            npyv_u8 b3 = npyv_loada_u8(data + vstep * 3);
            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
            
            #line 318
            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
            
#line 318
            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
            
#line 318
            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
            
#line 318
            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_u8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_u8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_u8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_u8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        /* Same loop as above, using the plain (non-aligned) load/store calls */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u8 b0 = npyv_load_u8(data + vstep * 0);
            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
            
#line 312
            npyv_u8 b1 = npyv_load_u8(data + vstep * 1);
            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
            
#line 312
            npyv_u8 b2 = npyv_load_u8(data + vstep * 2);
            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
            
#line 312
            npyv_u8 b3 = npyv_load_u8(data + vstep * 3);
            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
            
            #line 318
            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
            
#line 318
            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
            
#line 318
            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
            
#line 318
            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
            
            #line 323
            npyv_store_u8(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_u8(data_out + vstep * 1, abc1);
            
#line 323
            npyv_store_u8(data_out + vstep * 2, abc2);
            
#line 323
            npyv_store_u8(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Leftover elements, one vector step at a time.
     * NOTE(review): npyv_load_tillz_u8 / npyv_store_till_u8 presumably touch
     * only the first `count` lanes -- confirm against the NPYV headers.
     */
    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_u8 a = npyv_load_tillz_u8(data, count);
        npyv_u8 b = npyv_load_tillz_u8(data_out, count);
        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, v_scalar, b));
    }
    npyv_cleanup();
#else
    /* Scalar path: this is the code that is built for npy_ubyte. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Hand-unrolled by four: all loads first, then all multiply-adds,
     * then all stores.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_ubyte b0 = (data[0]);
        const npy_ubyte c0 = (data_out[0]);
        
#line 340
        const npy_ubyte b1 = (data[1]);
        const npy_ubyte c1 = (data_out[1]);
        
#line 340
        const npy_ubyte b2 = (data[2]);
        const npy_ubyte c2 = (data_out[2]);
        
#line 340
        const npy_ubyte b3 = (data[3]);
        const npy_ubyte c3 = (data_out[3]);
        
        #line 346
        const npy_ubyte abc0 = scalar * b0 + c0;
        
#line 346
        const npy_ubyte abc1 = scalar * b1 + c1;
        
#line 346
        const npy_ubyte abc2 = scalar * b2 + c2;
        
#line 346
        const npy_ubyte abc3 = scalar * b3 + c3;
        
        #line 351
        data_out[0] = (abc0);
        
#line 351
        data_out[1] = (abc1);
        
#line 351
        data_out[2] = (abc2);
        
#line 351
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is
     * defined), one per iteration.
     */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_ubyte b = (*data);
        const npy_ubyte c = (*data_out);
        *data_out = (scalar * b + c);
    }
#endif // NPYV check for npy_ubyte
}

/*
 * Two-operand sum-of-products kernel for contiguous inputs and output:
 *
 *     data_out[i] = data0[i] * data1[i] + data_out[i]    for 0 <= i < count
 *
 * nop      not used by this function
 * dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output;
 *          all three are accessed as npy_ubyte arrays
 * strides  unused (wrapped in NPY_UNUSED)
 * count    number of elements to process
 */
static void
ubyte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
    npy_ubyte *data_out = (npy_ubyte *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_ubyte
#if 0
    /*
     * SIMD path.  The check above is the constant 0 for this type, so this
     * branch is never compiled; the scalar code after the #else is used.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u8;

    #line 384
    if(is_aligned) {
        /* All three pointers aligned: four vectors per iteration */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
            
#line 390
            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
            
#line 390
            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
            
#line 390
            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
            
            #line 397
            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
            
#line 397
            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
            
#line 397
            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
            
#line 397
            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
            
            #line 402
            npyv_storea_u8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_storea_u8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_storea_u8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_storea_u8(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 384
    else {
        /* Same loop as above, using the plain (non-aligned) load/store calls */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
            
#line 390
            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
            
#line 390
            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
            
#line 390
            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
            
            #line 397
            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
            
#line 397
            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
            
#line 397
            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
            
#line 397
            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
            
            #line 402
            npyv_store_u8(data_out + vstep * 0, abc0);
            
#line 402
            npyv_store_u8(data_out + vstep * 1, abc1);
            
#line 402
            npyv_store_u8(data_out + vstep * 2, abc2);
            
#line 402
            npyv_store_u8(data_out + vstep * 3, abc3);
            
        }
    }
    
    /*
     * Leftover elements, one vector step at a time.
     * NOTE(review): npyv_load_tillz_u8 / npyv_store_till_u8 presumably touch
     * only the first `count` lanes -- confirm against the NPYV headers.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_u8 a = npyv_load_tillz_u8(data0, count);
        npyv_u8 b = npyv_load_tillz_u8(data1, count);
        npyv_u8 c = npyv_load_tillz_u8(data_out, count);
        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, b, c));
    }
    npyv_cleanup();
#else
    /* Scalar path: this is the code that is built for npy_ubyte. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Hand-unrolled by four: all loads first, then all multiply-adds,
     * then all stores.
     */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_ubyte a0 = (data0[0]);
        const npy_ubyte b0 = (data1[0]);
        const npy_ubyte c0 = (data_out[0]);
        
#line 420
        const npy_ubyte a1 = (data0[1]);
        const npy_ubyte b1 = (data1[1]);
        const npy_ubyte c1 = (data_out[1]);
        
#line 420
        const npy_ubyte a2 = (data0[2]);
        const npy_ubyte b2 = (data1[2]);
        const npy_ubyte c2 = (data_out[2]);
        
#line 420
        const npy_ubyte a3 = (data0[3]);
        const npy_ubyte b3 = (data1[3]);
        const npy_ubyte c3 = (data_out[3]);
        
        #line 427
        const npy_ubyte abc0 = a0 * b0 + c0;
        
#line 427
        const npy_ubyte abc1 = a1 * b1 + c1;
        
#line 427
        const npy_ubyte abc2 = a2 * b2 + c2;
        
#line 427
        const npy_ubyte abc3 = a3 * b3 + c3;
        
        #line 432
        data_out[0] = (abc0);
        
#line 432
        data_out[1] = (abc1);
        
#line 432
        data_out[2] = (abc2);
        
#line 432
        data_out[3] = (abc3);
        
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is
     * defined), one per iteration.
     */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_ubyte a = (*data0);
        const npy_ubyte b = (*data1);
        const npy_ubyte c = (*data_out);
        *data_out = (a * b + c);
    }
#endif // NPYV check for npy_ubyte

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 contributes a single value.
 *
 * One npy_ubyte is read from dataptr[0]; dataptr[1] (input) and dataptr[2]
 * (output) are handed to ubyte_sum_of_products_muladd as arrays of `count`
 * elements, which performs  out[i] = value * in[i] + out[i].
 *
 * `nop` and `strides` are not used.
 */
static void
ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
    npy_ubyte *const input = (npy_ubyte *)dataptr[1];
    npy_ubyte *const output = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* The scalar goes in as the multiplier, the array as the multiplicand. */
    ubyte_sum_of_products_muladd(input, output, factor, count);
}

/*
 * Two-operand kernel where operand 1 contributes a single value.
 *
 * One npy_ubyte is read from dataptr[1]; dataptr[0] (input) and dataptr[2]
 * (output) are handed to ubyte_sum_of_products_muladd as arrays of `count`
 * elements, which performs  out[i] = value * in[i] + out[i].
 *
 * `nop` and `strides` are not used.
 */
static void
ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
    npy_ubyte *const input = (npy_ubyte *)dataptr[0];
    npy_ubyte *const output = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* The scalar goes in as the multiplier, the array as the multiplicand. */
    ubyte_sum_of_products_muladd(input, output, factor, count);
}

/*
 * Two-operand kernel that reduces into a single output element:
 *
 *     *out = *out + sum(data0[i] * data1[i])    for 0 <= i < count
 *
 * nop      not used by this function
 * dataptr  dataptr[0] and dataptr[1] are read as npy_ubyte arrays of
 *          `count` elements; dataptr[2] points at the one npy_ubyte that
 *          receives the total
 * strides  unused (wrapped in NPY_UNUSED)
 * count    number of element pairs to multiply and sum
 *
 * The running total is kept in an npy_ubyte (`accum`), so every partial
 * sum is converted to that type when it is stored.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
    npy_ubyte accum = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_ubyte
    /*
     * SIMD path.  The check above is the constant 0 for this type, so this
     * branch is never compiled; the scalar code after the #else is used.
     */
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_u8;
    npyv_u8 vaccum = npyv_zero_u8();

    #line 495
    if(is_aligned) {
        /* Both inputs aligned: four vectors per iteration, chained into vaccum */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
            
#line 501
            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
            
#line 501
            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
            
#line 501
            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
            
            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, vaccum);
            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
                    vaccum = npyv_muladd_u8(a0, b0, ab1);
        }
    }
    
#line 495
    else {
        /* Same loop as above, using the plain (non-aligned) load calls */
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
            
#line 501
            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
            
#line 501
            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
            
#line 501
            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
            
            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, vaccum);
            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
                    vaccum = npyv_muladd_u8(a0, b0, ab1);
        }
    }
    
    /*
     * Leftover elements, one vector step at a time.
     * NOTE(review): npyv_load_tillz_u8 presumably zero-fills the lanes past
     * `count`, so they add nothing to vaccum -- confirm in the NPYV headers.
     */
    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_u8 a = npyv_load_tillz_u8(data0, count);
        npyv_u8 b = npyv_load_tillz_u8(data1, count);
        vaccum = npyv_muladd_u8(a, b, vaccum);
    }
    accum = npyv_sum_u8(vaccum);
    npyv_cleanup();
#else
    /* Scalar path: this is the code that is built for npy_ubyte. */
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Hand-unrolled by four: four products, then one add into accum */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        #line 524
        const npy_ubyte ab0 = (data0[0]) * (data1[0]);
        
#line 524
        const npy_ubyte ab1 = (data0[1]) * (data1[1]);
        
#line 524
        const npy_ubyte ab2 = (data0[2]) * (data1[2]);
        
#line 524
        const npy_ubyte ab3 = (data0[3]) * (data1[3]);
        
        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /*
     * Remaining elements (all of them when NPY_DISABLE_OPTIMIZATION is
     * defined), one per iteration.
     */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_ubyte a = (*data0);
        const npy_ubyte b = (*data1);
        accum += a * b;
    }
#endif // NPYV check for npy_ubyte
    /* Fold the total into the value already stored in the output element. */
    *(npy_ubyte *)dataptr[2] = ((*(npy_ubyte *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction where operand 0 contributes a single value.
 *
 * Adds  value0 * S  to the one npy_ubyte at dataptr[2], where value0 is
 * read from dataptr[0] and S is what ubyte_sum_of_arr returns for the
 * `count` elements starting at dataptr[1].
 *
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *const result = (npy_ubyte *)dataptr[2];
    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);

    /* Factor is applied once to the summed array instead of per element. */
    *result = *result + factor * total;
}

/*
 * Two-operand reduction where operand 1 contributes a single value.
 *
 * Adds  value1 * S  to the one npy_ubyte at dataptr[2], where value1 is
 * read from dataptr[1] and S is what ubyte_sum_of_arr returns for the
 * `count` elements starting at dataptr[0].
 *
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *const result = (npy_ubyte *)dataptr[2];
    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);

    /* Factor is applied once to the summed array instead of per element. */
    *result = *result + factor * total;
}

#elif 1 == 3 && !0

/*
 * Three-operand sum-of-products kernel for contiguous inputs and output:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output, all accessed
 * as npy_ubyte arrays.  `nop` and `strides` are not used.
 */
static void
ubyte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *in0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *in1 = (npy_ubyte *)dataptr[1];
    npy_ubyte *in2 = (npy_ubyte *)dataptr[2];
    npy_ubyte *out = (npy_ubyte *)dataptr[3];
    int k;

    /* Whole groups of eight elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /*
     * Leftovers: handle one element per step and leave as soon as the
     * remaining count reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic sum-of-products kernel for `nop` contiguous npy_ubyte operands.
 *
 * For each of `count` steps: multiply the current element of operands
 * 0 .. nop-1 together, add the current output element, store the result in
 * dataptr[nop], then advance every pointer in dataptr[0..nop] by
 * sizeof(npy_ubyte).  The pointers in `dataptr` are modified in place.
 *
 * `strides` is not used.
 */
static void
ubyte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_ubyte product = *(npy_ubyte *)dataptr[0];
        int op = 1;

        while (op < nop) {
            product *= *(npy_ubyte *)dataptr[op];
            ++op;
        }

        /* After the loop `op` indexes the output operand (op == nop for nop >= 1). */
        *(npy_ubyte *)dataptr[nop] = product + *(npy_ubyte *)dataptr[op];

        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_ubyte);
        }
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * One-operand reduction of a contiguous input into a single output element.
 *
 * Adds the value ubyte_sum_of_arr returns for the `count` npy_ubyte
 * elements at dataptr[0] to the one npy_ubyte at dataptr[1].
 *
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
    npy_ubyte *const result = (npy_ubyte *)dataptr[1];

    *result = total + *result;
}

#endif /* 1 == 1 */

/*
 * One-operand reduction of a strided input into a single output element.
 *
 * Walks `count` npy_ubyte values starting at dataptr[0], stepping by
 * strides[0] bytes, sums them in an npy_ubyte accumulator, and adds the
 * total to the one npy_ubyte at dataptr[1].
 *
 * `nop` is not used.
 */
static void
ubyte_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_ubyte total = 0;
    char *cursor = dataptr[0];
    const npy_intp step = strides[0];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += *(npy_ubyte *)cursor;
        cursor += step;
    }

    *(npy_ubyte *)dataptr[1] = total + *(npy_ubyte *)dataptr[1];
}


#line 131
/*
 * Two-operand sum-of-products kernel for arbitrarily strided operands.
 *
 * For each of `count` steps:
 *     *out = (*in0) * (*in1) + (*out)
 * with in0, in1 and out starting at dataptr[0], dataptr[1] and dataptr[2]
 * and advancing by strides[0], strides[1] and strides[2] bytes per step.
 * All three are accessed as npy_ubyte.
 *
 * `nop` is not used.
 */
static void
ubyte_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    char *dst = dataptr[2];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    const npy_intp dst_step = strides[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_two (%d)\n", (int)count);

    for (; count != 0; --count) {
        npy_ubyte *const slot = (npy_ubyte *)dst;
        const npy_ubyte a = *(npy_ubyte *)lhs;
        const npy_ubyte b = *(npy_ubyte *)rhs;

        *slot = a * b + *slot;

        lhs += lhs_step;
        rhs += rhs_step;
        dst += dst_step;
    }
}

#if 2 == 1

/*
 * One-operand kernel for contiguous npy_ubyte data.
 *
 * For each of the `count` elements, the element of operand 0 (dataptr[0])
 * is added to the matching element of the output (dataptr[1]) and stored
 * back into the output:
 *
 *     data_out[i] = data0[i] + data_out[i]
 *
 * `nop` is not read and `strides` is marked unused; both buffers are
 * indexed as plain npy_ubyte arrays.
 */
static void
ubyte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *src = (npy_ubyte *)dataptr[0];
    npy_ubyte *dst = (npy_ubyte *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /*
         * Fewer than eight elements remain: finish them from the highest
         * index down to index 0 and return.  This test sits ahead of the
         * unrolled loop so that small counts are handled quickly.
         */
        if (count >= 0 && count < 8) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }
            return;
        }

        /* Bulk of the work: eight elements per pass, lowest index first. */
        while (count >= 8) {
            count -= 8;

            for (k = 0; k < 8; ++k) {
                dst[k] = ((src[k]) +
                          (dst[k]));
            }

            src += 8;
            dst += 8;
        }
    }
}

#elif 2 == 2 && !0

/*
 * Multiply-accumulate helper for npy_ubyte buffers:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 * data     - input elements, read only
 * data_out - accumulator elements, updated in place
 * scalar   - factor applied to every input element
 * count    - number of elements to process
 *
 * The universal-intrinsics (NPYV) variant is switched off for npy_ubyte in
 * this instantiation, so only the scalar code is present.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per pass.  All eight loads happen before the first
     * store, exactly as in the element-at-a-time-unrolled form.
     */
    while (count >= 4) {
        const npy_ubyte in0 = (data[0]);
        const npy_ubyte in1 = (data[1]);
        const npy_ubyte in2 = (data[2]);
        const npy_ubyte in3 = (data[3]);
        const npy_ubyte acc0 = (data_out[0]);
        const npy_ubyte acc1 = (data_out[1]);
        const npy_ubyte acc2 = (data_out[2]);
        const npy_ubyte acc3 = (data_out[3]);

        data_out[0] = (scalar * in0 + acc0);
        data_out[1] = (scalar * in1 + acc1);
        data_out[2] = (scalar * in2 + acc2);
        data_out[3] = (scalar * in3 + acc3);

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Whatever is left (or everything, when the unrolled loop is disabled). */
    while (count > 0) {
        const npy_ubyte in = (*data);
        const npy_ubyte acc = (*data_out);

        *data_out = (scalar * in + acc);

        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand kernel for contiguous npy_ubyte data.
 *
 * For each of the `count` elements:
 *
 *     data_out[i] = data0[i] * data1[i] + data_out[i]
 *
 * where data0, data1 and data_out are dataptr[0], dataptr[1] and
 * dataptr[2].  `nop` is not read and `strides` is marked unused.
 *
 * The universal-intrinsics (NPYV) variant is switched off for npy_ubyte in
 * this instantiation, so only the scalar code is present.
 */
static void
ubyte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four elements per pass; every load precedes the first store. */
    while (count >= 4) {
        const npy_ubyte l0 = (lhs[0]), r0 = (rhs[0]), d0 = (dst[0]);
        const npy_ubyte l1 = (lhs[1]), r1 = (rhs[1]), d1 = (dst[1]);
        const npy_ubyte l2 = (lhs[2]), r2 = (rhs[2]), d2 = (dst[2]);
        const npy_ubyte l3 = (lhs[3]), r3 = (rhs[3]), d3 = (dst[3]);

        dst[0] = (l0 * r0 + d0);
        dst[1] = (l1 * r1 + d1);
        dst[2] = (l2 * r2 + d2);
        dst[3] = (l3 * r3 + d3);

        count -= 4;
        lhs += 4;
        rhs += 4;
        dst += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time. */
    while (count > 0) {
        const npy_ubyte l = (*lhs);
        const npy_ubyte r = (*rhs);
        const npy_ubyte d = (*dst);

        *dst = (l * r + d);

        --count;
        ++lhs;
        ++rhs;
        ++dst;
    }

}

/* Some extra specializations for the two operand case */

/*
 * Reads a single npy_ubyte from operand 0 (dataptr[0]) and uses it as the
 * scalar factor for ubyte_sum_of_products_muladd over operand 1
 * (dataptr[1]) and the output (dataptr[2]):
 *
 *     data_out[i] = value0 * data1[i] + data_out[i]
 *
 * `nop` is not read and `strides` is marked unused.
 */
static void
ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = (*(npy_ubyte *)dataptr[0]);
    npy_ubyte *src = (npy_ubyte *)dataptr[1];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    ubyte_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Mirror image of the stride0_contig_outcontig kernel: the single value
 * comes from operand 1 (dataptr[1]) and the array input is operand 0
 * (dataptr[0]).  The work is delegated to ubyte_sum_of_products_muladd:
 *
 *     data_out[i] = value1 * data0[i] + data_out[i]
 *
 * `nop` is not read and `strides` is marked unused.
 */
static void
ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = (*(npy_ubyte *)dataptr[1]);
    npy_ubyte *src = (npy_ubyte *)dataptr[0];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    ubyte_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand reduction into a single output element.
 *
 * Accumulates data0[i] * data1[i] over `count` contiguous npy_ubyte
 * elements (dataptr[0] and dataptr[1]) in an npy_ubyte accumulator, then
 * adds that accumulator to the one output element at dataptr[2].
 * `nop` is not read and `strides` is marked unused.
 *
 * The universal-intrinsics (NPYV) variant is switched off for npy_ubyte in
 * this instantiation, so only the scalar code is present.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *out;
    npy_ubyte total = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four products per pass, each narrowed to npy_ubyte before summing. */
    while (count >= 4) {
        const npy_ubyte p0 = (lhs[0]) * (rhs[0]);
        const npy_ubyte p1 = (lhs[1]) * (rhs[1]);
        const npy_ubyte p2 = (lhs[2]) * (rhs[2]);
        const npy_ubyte p3 = (lhs[3]) * (rhs[3]);

        total += p0 + p1 + p2 + p3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements, one at a time. */
    while (count > 0) {
        const npy_ubyte l = (*lhs);
        const npy_ubyte r = (*rhs);

        total += l * r;

        --count;
        ++lhs;
        ++rhs;
    }

    /* Fold the partial sum into the existing output value. */
    out = (npy_ubyte *)dataptr[2];
    *out = ((*out) + total);
}

/*
 * Reduction where operand 0 contributes a single value.
 *
 * Reads one npy_ubyte from dataptr[0], obtains a reduction of the `count`
 * elements at dataptr[1] from ubyte_sum_of_arr (defined elsewhere in this
 * file; presumably their sum), and adds value * reduction to the single
 * output element at dataptr[2].  `nop` is not read and `strides` is marked
 * unused.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *out = (npy_ubyte *)dataptr[2];
    const npy_ubyte factor = (*(npy_ubyte *)dataptr[0]);
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);

    *out = ((*out) + factor * total);
}

/*
 * Reduction where operand 1 contributes a single value.
 *
 * Reads one npy_ubyte from dataptr[1], obtains a reduction of the `count`
 * elements at dataptr[0] from ubyte_sum_of_arr (defined elsewhere in this
 * file; presumably their sum), and adds value * reduction to the single
 * output element at dataptr[2].  `nop` is not read and `strides` is marked
 * unused.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *out = (npy_ubyte *)dataptr[2];
    const npy_ubyte factor = (*(npy_ubyte *)dataptr[1]);
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);

    *out = ((*out) + factor * total);
}

#elif 2 == 3 && !0

/*
 * Three-operand kernel for contiguous npy_ubyte data.
 *
 * For each of the `count` elements:
 *
 *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
 *
 * with the inputs at dataptr[0..2] and the output at dataptr[3].
 * `nop` is not read and `strides` is marked unused.
 */
static void
ubyte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *op0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *op1 = (npy_ubyte *)dataptr[1];
    npy_ubyte *op2 = (npy_ubyte *)dataptr[2];
    npy_ubyte *dst = (npy_ubyte *)dataptr[3];
    int k;

    /* Groups of eight elements, lowest index first within each group. */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            dst[k] = ((op0[k]) *
                      (op1[k]) *
                      (op2[k]) +
                      (dst[k]));
        }

        op0 += 8;
        op1 += 8;
        op2 += 8;
        dst += 8;
    }

    /*
     * Finish off the loop: up to eight more elements, stopping as soon as
     * the remaining count reaches zero.
     */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        dst[k] = ((op0[k]) *
                  (op1[k]) *
                  (op2[k]) +
                  (dst[k]));
    }

}

#else /* 2 > 3 || @complex */

/*
 * Generic contiguous kernel driven by `nop`.
 *
 * For each of the `count` steps, multiplies together the current
 * npy_ubyte element of operands 0 .. nop-1, adds the current output
 * element and stores the result at dataptr[nop]; then every pointer in
 * dataptr[0 .. nop] is advanced by sizeof(npy_ubyte).  The dataptr array
 * itself is modified.  `strides` is marked unused.
 */
static void
ubyte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_ubyte product = (*(npy_ubyte *)dataptr[0]);
        int op = 1;

        while (op < nop) {
            product *= (*(npy_ubyte *)dataptr[op]);
            ++op;
        }

        /*
         * The addend is read through `op`, which equals `nop` here
         * whenever nop >= 1, i.e. it is the current output element.
         */
        *(npy_ubyte *)dataptr[nop] = (product +
                                           (*(npy_ubyte *)dataptr[op]));

        /* Step all inputs and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_ubyte);
        }
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * One-operand reduction into a single output element.
 *
 * Passes the `count` npy_ubyte elements at dataptr[0] to ubyte_sum_of_arr
 * (defined elsewhere in this file; presumably returns their sum) and adds
 * the returned value to the output element at dataptr[1].
 * `nop` and `strides` are not read.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_ubyte *out = (npy_ubyte *)dataptr[1];
    npy_ubyte total;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
    *out = (total + (*out));
}

#endif /* 2 == 1 */

/*
 * Two-operand strided reduction into a single output element.
 *
 * Walks operand 0 and operand 1 (dataptr[0], dataptr[1]) using the byte
 * steps strides[0] and strides[1], accumulating the product of the two
 * current npy_ubyte elements in an npy_ubyte accumulator for `count`
 * steps.  The accumulator is then added to the output element at
 * dataptr[2].  `nop` is not read.
 */
static void
ubyte_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *lhs = dataptr[0];
    char *rhs = dataptr[1];
    const npy_intp lhs_step = strides[0];
    const npy_intp rhs_step = strides[1];
    npy_ubyte *out = (npy_ubyte *)dataptr[2];
    npy_ubyte total = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_ubyte *)lhs) *
                 (*(npy_ubyte *)rhs);
        lhs += lhs_step;
        rhs += rhs_step;
    }

    /* Fold the partial sum into the existing output value. */
    *out = (total +
            (*out));

}


#line 131
/*
 * Three-operand strided sum of products for npy_ubyte.
 *
 * For each of `count` positions, multiplies one element from each of the
 * three operands and adds the product to the output element at the same
 * position.  All four cursors advance by their own byte stride.
 *
 *   nop      - not used by this three-operand specialization
 *   dataptr  - [0..2]: operand base pointers; [3]: output base pointer
 *   strides  - [0..2]: operand byte strides; [3]: output byte stride
 *   count    - number of positions to process
 *
 * Local copies of the pointers are advanced; dataptr itself is untouched.
 */
static void
ubyte_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    const npy_intp step_x = strides[0];
    const npy_intp step_y = strides[1];
    const npy_intp step_z = strides[2];
    const npy_intp step_dst = strides[3];
    char *cursor_x = dataptr[0];
    char *cursor_y = dataptr[1];
    char *cursor_z = dataptr[2];
    char *cursor_dst = dataptr[3];
    npy_intp remaining;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_three (%d)\n", (int)count);

    for (remaining = count; remaining != 0; --remaining) {
        npy_ubyte *dst = (npy_ubyte *)cursor_dst;
        const npy_ubyte x = *(npy_ubyte *)cursor_x;
        const npy_ubyte y = *(npy_ubyte *)cursor_y;
        const npy_ubyte z = *(npy_ubyte *)cursor_z;

        /* Triple product plus the current output, narrowed on store. */
        *dst += x * y * z;

        cursor_x += step_x;
        cursor_y += step_y;
        cursor_z += step_z;
        cursor_dst += step_dst;
    }
}

#if 3 == 1

/*
 * One-operand contiguous kernel for npy_ubyte: adds each input element to
 * the output element at the same index.
 *
 *   nop      - not used by this one-operand specialization
 *   dataptr  - [0]: contiguous input; [1]: contiguous output
 *   strides  - unused (both buffers are indexed as plain arrays)
 *   count    - number of elements to process
 *
 * Counts of 0..7 are dispatched through the switch, which is tested before
 * the 8-wide loop so that short runs never enter it.  Larger counts go
 * through the 8-wide loop and then come back to the switch for the tail.
 */
static void
ubyte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *src = (npy_ubyte *)dataptr[0];
    npy_ubyte *dst = (npy_ubyte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Tail handling: enter at `count` and fall through down to zero. */
        switch (count) {
            case 7:
                dst[6] += src[6];
                /* fall through */
            case 6:
                dst[5] += src[5];
                /* fall through */
            case 5:
                dst[4] += src[4];
                /* fall through */
            case 4:
                dst[3] += src[3];
                /* fall through */
            case 3:
                dst[2] += src[2];
                /* fall through */
            case 2:
                dst[1] += src[1];
                /* fall through */
            case 1:
                dst[0] += src[0];
                /* fall through */
            case 0:
                return;
        }

        /* Bulk of the work, eight elements per iteration. */
        while (count >= 8) {
            count -= 8;

            dst[0] += src[0];
            dst[1] += src[1];
            dst[2] += src[2];
            dst[3] += src[3];
            dst[4] += src[4];
            dst[5] += src[5];
            dst[6] += src[6];
            dst[7] += src[7];

            src += 8;
            dst += 8;
        }
        /* Loop back to the switch to finish the remaining 0..7 elements. */
    }
}

#elif 3 == 2 && !0

/*
 * Multiply-add over two contiguous npy_ubyte buffers:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 *   data      - contiguous input
 *   data_out  - contiguous buffer that is both read and updated
 *   scalar    - factor applied to every input element
 *   count     - number of elements to process
 *
 * Scalar implementation only.  Unless NPY_DISABLE_OPTIMIZATION is defined,
 * elements are handled four at a time first; the final loop picks up
 * whatever is left.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All four results are computed before the first store. */
        const npy_ubyte r0 = scalar * data[0] + data_out[0];
        const npy_ubyte r1 = scalar * data[1] + data_out[1];
        const npy_ubyte r2 = scalar * data[2] + data_out[2];
        const npy_ubyte r3 = scalar * data[3] + data_out[3];

        data_out[0] = r0;
        data_out[1] = r1;
        data_out[2] = r2;
        data_out[3] = r3;

        data += 4;
        data_out += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *data_out = scalar * (*data) + (*data_out);
        ++data;
        ++data_out;
        --count;
    }
}

/*
 * Two-operand contiguous kernel for npy_ubyte:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for i in [0, count)
 *
 *   nop      - not used by this two-operand specialization
 *   dataptr  - [0], [1]: contiguous inputs; [2]: contiguous output
 *   strides  - unused (all buffers are indexed as plain arrays)
 *   count    - number of elements to process
 *
 * Scalar implementation only.  Unless NPY_DISABLE_OPTIMIZATION is defined,
 * elements are handled four at a time first; the final loop picks up
 * whatever is left.
 */
static void
ubyte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* All four results are computed before the first store. */
        const npy_ubyte r0 = lhs[0] * rhs[0] + dst[0];
        const npy_ubyte r1 = lhs[1] * rhs[1] + dst[1];
        const npy_ubyte r2 = lhs[2] * rhs[2] + dst[2];
        const npy_ubyte r3 = lhs[3] * rhs[3] + dst[3];

        dst[0] = r0;
        dst[1] = r1;
        dst[2] = r2;
        dst[3] = r3;

        lhs += 4;
        rhs += 4;
        dst += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        *dst = (*lhs) * (*rhs) + (*dst);
        ++lhs;
        ++rhs;
        ++dst;
        --count;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel in which operand 0 supplies a single value and
 * operand 1 and the output are contiguous.
 *
 * Reads one npy_ubyte from dataptr[0] and hands the work to
 * ubyte_sum_of_products_muladd, which applies
 * out[i] = value * in1[i] + out[i] over `count` elements.
 *
 *   nop      - not used
 *   dataptr  - [0]: the single value; [1]: contiguous input; [2]: output
 *   strides  - unused
 *   count    - number of elements to process
 */
static void
ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
    npy_ubyte *const in_values = (npy_ubyte *)dataptr[1];
    npy_ubyte *const out_values = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    ubyte_sum_of_products_muladd(in_values, out_values, factor, count);
}

/*
 * Two-operand kernel in which operand 1 supplies a single value and
 * operand 0 and the output are contiguous.
 *
 * Reads one npy_ubyte from dataptr[1] and hands the work to
 * ubyte_sum_of_products_muladd, which applies
 * out[i] = value * in0[i] + out[i] over `count` elements.
 *
 *   nop      - not used
 *   dataptr  - [0]: contiguous input; [1]: the single value; [2]: output
 *   strides  - unused
 *   count    - number of elements to process
 */
static void
ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
    npy_ubyte *const in_values = (npy_ubyte *)dataptr[0];
    npy_ubyte *const out_values = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    ubyte_sum_of_products_muladd(in_values, out_values, factor, count);
}

/*
 * Two-operand kernel with contiguous inputs and a single output element:
 * accumulates in0[i] * in1[i] over `count` elements and adds the result to
 * the npy_ubyte at dataptr[2].
 *
 *   nop      - not used by this two-operand specialization
 *   dataptr  - [0], [1]: contiguous inputs; [2]: output element
 *   strides  - unused
 *   count    - number of element pairs to process
 *
 * Scalar implementation only.  Unless NPY_DISABLE_OPTIMIZATION is defined,
 * products are formed four at a time first; the final loop picks up
 * whatever is left.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *out = (npy_ubyte *)dataptr[2];
    npy_ubyte dot = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Each product is narrowed to npy_ubyte before the four are summed. */
        const npy_ubyte p0 = lhs[0] * rhs[0];
        const npy_ubyte p1 = lhs[1] * rhs[1];
        const npy_ubyte p2 = lhs[2] * rhs[2];
        const npy_ubyte p3 = lhs[3] * rhs[3];

        dot += p0 + p1 + p2 + p3;

        lhs += 4;
        rhs += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        dot += (*lhs) * (*rhs);
        ++lhs;
        ++rhs;
        --count;
    }

    /* Fold the accumulated value into the existing output element. */
    *out += dot;
}

/*
 * Two-operand kernel in which operand 0 supplies a single value, operand 1
 * is contiguous, and the output is a single element.
 *
 * Calls ubyte_sum_of_arr on operand 1 (presumably the sum of its `count`
 * elements -- the helper is defined elsewhere in this file), multiplies
 * that by the value read from dataptr[0], and adds the product to the
 * npy_ubyte at dataptr[2].
 *
 *   nop      - not used
 *   dataptr  - [0]: the single value; [1]: contiguous input; [2]: output
 *   strides  - unused
 *   count    - number of elements passed on to ubyte_sum_of_arr
 *
 * Emits the same entry trace as the other specialized kernels so that
 * debug builds show when this path was selected.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
    npy_ubyte value0 = (*(npy_ubyte *)dataptr[0]);
    npy_ubyte accum;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
                                                    (int)count);

    accum = ubyte_sum_of_arr(data1, count);
    *(npy_ubyte *)dataptr[2] = ((*(npy_ubyte *)dataptr[2]) + value0 * accum);
}

/*
 * Two-operand kernel in which operand 0 is contiguous, operand 1 supplies
 * a single value, and the output is a single element.
 *
 * Calls ubyte_sum_of_arr on operand 0 (presumably the sum of its `count`
 * elements -- the helper is defined elsewhere in this file), multiplies
 * that by the value read from dataptr[1], and adds the product to the
 * npy_ubyte at dataptr[2].
 *
 *   nop      - not used
 *   dataptr  - [0]: contiguous input; [1]: the single value; [2]: output
 *   strides  - unused
 *   count    - number of elements passed on to ubyte_sum_of_arr
 *
 * Emits the same entry trace as the other specialized kernels so that
 * debug builds show when this path was selected.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte value1 = (*(npy_ubyte *)dataptr[1]);
    npy_ubyte accum;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
                                                    (int)count);

    accum = ubyte_sum_of_arr(data0, count);
    *(npy_ubyte *)dataptr[2] = ((*(npy_ubyte *)dataptr[2]) + value1 * accum);
}

#elif 3 == 3 && !0

/*
 * Three-operand contiguous kernel for npy_ubyte:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for i in [0, count)
 *
 *   nop      - not used by this three-operand specialization
 *   dataptr  - [0..2]: contiguous inputs; [3]: contiguous output
 *   strides  - unused (all buffers are indexed as plain arrays)
 *   count    - number of elements to process; a count <= 0 does nothing
 *
 * The bulk is processed eight elements per iteration.  At most seven
 * elements can remain afterwards, so the remainder loop is bounded by
 * `count` itself.
 */
static void
ubyte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
    npy_ubyte *data2 = (npy_ubyte *)dataptr[2];
    npy_ubyte *data_out = (npy_ubyte *)dataptr[3];
    npy_intp i;

    /* Entry trace, matching the other specialized kernels. */
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

        data_out[0] += data0[0] * data1[0] * data2[0];
        data_out[1] += data0[1] * data1[1] * data2[1];
        data_out[2] += data0[2] * data1[2] * data2[2];
        data_out[3] += data0[3] * data1[3] * data2[3];
        data_out[4] += data0[4] * data1[4] * data2[4];
        data_out[5] += data0[5] * data1[5] * data2[5];
        data_out[6] += data0[6] * data1[6] * data2[6];
        data_out[7] += data0[7] * data1[7] * data2[7];

        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Finish off the remaining 0..7 elements in ascending order. */
    for (i = 0; i < count; ++i) {
        data_out[i] += data0[i] * data1[i] * data2[i];
    }
}

#else /* 3 > 3 || @complex */

/*
 * Generic fallback kernel for contiguous npy_ubyte operands.
 *
 * For each of `count` elements, the current npy_ubyte values of operands
 * 0 .. nop-1 are multiplied together and the product is added to the
 * npy_ubyte that dataptr[nop] points at.
 *
 * Every pointer in dataptr[0 .. nop] is advanced by sizeof(npy_ubyte) per
 * element, so the caller's dataptr array is modified in place. The strides
 * argument is not used.
 */
static void
ubyte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        int op;
        npy_ubyte product = *(npy_ubyte *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_ubyte *)dataptr[op];
        }

        /*
         * The previous output value is read through dataptr[op]; the loop
         * above leaves op equal to nop whenever nop >= 1.
         */
        *(npy_ubyte *)dataptr[nop] = product + *(npy_ubyte *)dataptr[op];

        /* Step all inputs and the output to the next element. */
        for (op = 0; op <= nop; ++op) {
            dataptr[op] += sizeof(npy_ubyte);
        }
    }
}

#endif /* functions for various 3 */

#if 3 == 1

/*
 * One-operand reduction into a single output element.
 *
 * Adds the value returned by ubyte_sum_of_arr() for the `count` npy_ubyte
 * elements starting at dataptr[0] to the npy_ubyte at dataptr[1].
 * Neither nop nor strides is read.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
    npy_ubyte *out = (npy_ubyte *)dataptr[1];

    *out = total + *out;
}

#endif /* 3 == 1 */

/*
 * Three-operand sum of products reduced into one output element.
 *
 * Walks `count` elements of operands 0, 1 and 2 using their byte strides,
 * accumulates the product of the three npy_ubyte values in a local
 * npy_ubyte, and finally adds that accumulator to the npy_ubyte at
 * dataptr[3]. The output pointer is never advanced, nop is not read, and
 * the entries of dataptr itself are left unmodified.
 */
static void
ubyte_sum_of_products_outstride0_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    const npy_intp step_a = strides[0];
    const npy_intp step_b = strides[1];
    const npy_intp step_c = strides[2];
    char *cursor_a = dataptr[0];
    char *cursor_b = dataptr[1];
    char *cursor_c = dataptr[2];
    npy_ubyte *out = (npy_ubyte *)dataptr[3];
    npy_ubyte total = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_three (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        total += (*(npy_ubyte *)cursor_a) *
                 (*(npy_ubyte *)cursor_b) *
                 (*(npy_ubyte *)cursor_c);

        cursor_a += step_a;
        cursor_b += step_b;
        cursor_c += step_c;
    }

    /* Fold the local accumulator into the existing output value. */
    *out = total + *out;
}


#line 131
/*
 * General strided sum-of-products kernel for any number of operands.
 *
 * For each of `count` elements, multiplies the npy_ubyte values currently
 * pointed at by dataptr[0 .. nop-1] and adds the product to the npy_ubyte
 * at dataptr[nop]. After each element, dataptr[k] is advanced by
 * strides[k] bytes for every k in 0 .. nop, so the caller's dataptr array
 * is modified in place.
 */
static void
ubyte_sum_of_products_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_any (%d)\n", (int)count);

    while (count-- != 0) {
        int k = 1;
        npy_ubyte running = *(npy_ubyte *)dataptr[0];

        while (k < nop) {
            running *= *(npy_ubyte *)dataptr[k];
            ++k;
        }

        /*
         * k is left at nop by the loop above (for nop >= 1), so dataptr[k]
         * is the output operand whose old value is being added in.
         */
        *(npy_ubyte *)dataptr[nop] = running + *(npy_ubyte *)dataptr[k];

        for (k = 0; k <= nop; ++k) {
            dataptr[k] += strides[k];
        }
    }
}

#if 1000 == 1

/*
 * One-operand contiguous kernel: data_out[i] = data0[i] + data_out[i] for
 * `count` npy_ubyte elements, where data0 is dataptr[0] and data_out is
 * dataptr[1]. Neither nop nor strides is read, and the entries of dataptr
 * are not modified.
 *
 * Full groups of 8 are processed first in ascending index order; the
 * remaining 0..7 elements are then processed from the highest index down
 * through switch fallthrough. This is the same element order the previous
 * goto-based implementation produced for every count >= 0.
 *
 * A count that is not in 0..7 after the unrolled loop (i.e. a negative
 * count) now falls into the `default` case and returns. Previously such a
 * value matched no case, skipped the unrolled loop, and jumped back to the
 * switch forever.
 */
static void
ubyte_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *data_out = (npy_ubyte *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    /* Unroll the loop by 8 */
    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
        data_out[0] = data0[0] + data_out[0];
        data_out[1] = data0[1] + data_out[1];
        data_out[2] = data0[2] + data_out[2];
        data_out[3] = data0[3] + data_out[3];
        data_out[4] = data0[4] + data_out[4];
        data_out[5] = data0[5] + data_out[5];
        data_out[6] = data0[6] + data_out[6];
        data_out[7] = data0[7] + data_out[7];
    }

    /* Finish off the remaining 0..7 elements, highest index first */
    switch (count) {
        case 7:
            data_out[6] = data0[6] + data_out[6];
            /* fallthrough */
        case 6:
            data_out[5] = data0[5] + data_out[5];
            /* fallthrough */
        case 5:
            data_out[4] = data0[4] + data_out[4];
            /* fallthrough */
        case 4:
            data_out[3] = data0[3] + data_out[3];
            /* fallthrough */
        case 3:
            data_out[2] = data0[2] + data_out[2];
            /* fallthrough */
        case 2:
            data_out[1] = data0[1] + data_out[1];
            /* fallthrough */
        case 1:
            data_out[0] = data0[0] + data_out[0];
            break;
        default:
            /* count == 0, or a negative count on entry: nothing to do */
            break;
    }
}

#elif 1000 == 2 && !0

/*
 * Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for
 * `count` contiguous npy_ubyte elements.
 *
 * Only scalar code is used for npy_ubyte. Unless NPY_DISABLE_OPTIMIZATION
 * is defined, elements are first handled in groups of four; whatever is
 * left is handled one element at a time.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
{
#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read the whole group before any of it is written back. */
        const npy_ubyte in0 = data[0];
        const npy_ubyte in1 = data[1];
        const npy_ubyte in2 = data[2];
        const npy_ubyte in3 = data[3];
        const npy_ubyte old0 = data_out[0];
        const npy_ubyte old1 = data_out[1];
        const npy_ubyte old2 = data_out[2];
        const npy_ubyte old3 = data_out[3];

        data_out[0] = scalar * in0 + old0;
        data_out[1] = scalar * in1 + old1;
        data_out[2] = scalar * in2 + old2;
        data_out[3] = scalar * in3 + old3;

        count -= 4;
        data += 4;
        data_out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        const npy_ubyte in = *data;
        const npy_ubyte old = *data_out;

        *data_out = scalar * in + old;

        --count;
        ++data;
        ++data_out;
    }
}

/*
 * Two-operand contiguous kernel:
 *     out[i] = lhs[i] * rhs[i] + out[i]
 * for `count` npy_ubyte elements, with lhs = dataptr[0], rhs = dataptr[1]
 * and out = dataptr[2]. Neither nop nor strides is read, and the entries
 * of dataptr are not modified.
 *
 * Only scalar code is used for npy_ubyte. Unless NPY_DISABLE_OPTIMIZATION
 * is defined, elements are first handled in groups of four; the remainder
 * is handled one element at a time.
 */
static void
ubyte_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *out = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
                                                            (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Read the whole group before any of it is written back. */
        const npy_ubyte x0 = lhs[0], y0 = rhs[0], z0 = out[0];
        const npy_ubyte x1 = lhs[1], y1 = rhs[1], z1 = out[1];
        const npy_ubyte x2 = lhs[2], y2 = rhs[2], z2 = out[2];
        const npy_ubyte x3 = lhs[3], y3 = rhs[3], z3 = out[3];

        out[0] = x0 * y0 + z0;
        out[1] = x1 * y1 + z1;
        out[2] = x2 * y2 + z2;
        out[3] = x3 * y3 + z3;

        count -= 4;
        lhs += 4;
        rhs += 4;
        out += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        const npy_ubyte x = *lhs;
        const npy_ubyte y = *rhs;
        const npy_ubyte z = *out;

        *out = x * y + z;

        --count;
        ++lhs;
        ++rhs;
        ++out;
    }
}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case where operand 0 is a single value and operand 1 and the
 * output are contiguous: reads one npy_ubyte from dataptr[0] and passes it
 * as the scalar to ubyte_sum_of_products_muladd(), with dataptr[1] as the
 * input array and dataptr[2] as the output array, for `count` elements.
 * Neither nop nor strides is read.
 */
static void
ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte scale = *(npy_ubyte *)dataptr[0];
    npy_ubyte *src = (npy_ubyte *)dataptr[1];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    ubyte_sum_of_products_muladd(src, dst, scale, count);
}

/*
 * Two-operand case where operand 1 is a single value and operand 0 and the
 * output are contiguous: reads one npy_ubyte from dataptr[1] and passes it
 * as the scalar to ubyte_sum_of_products_muladd(), with dataptr[0] as the
 * input array and dataptr[2] as the output array, for `count` elements.
 * Neither nop nor strides is read.
 */
static void
ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte scale = *(npy_ubyte *)dataptr[1];
    npy_ubyte *src = (npy_ubyte *)dataptr[0];
    npy_ubyte *dst = (npy_ubyte *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    ubyte_sum_of_products_muladd(src, dst, scale, count);
}

/*
 * Two contiguous operands reduced into a single output element.
 *
 * Accumulates lhs[i] * rhs[i] over `count` npy_ubyte elements (lhs =
 * dataptr[0], rhs = dataptr[1]) in a local npy_ubyte, then adds that
 * accumulator to the npy_ubyte at dataptr[2]. Neither nop nor strides is
 * read, and the entries of dataptr are not modified.
 *
 * Only scalar code is used for npy_ubyte. Unless NPY_DISABLE_OPTIMIZATION
 * is defined, elements are first handled in groups of four; the remainder
 * is handled one element at a time.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
    npy_ubyte *out = (npy_ubyte *)dataptr[2];
    npy_ubyte total = 0;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count >= 4) {
        /* Each product is narrowed to npy_ubyte before the group is summed. */
        const npy_ubyte prod0 = lhs[0] * rhs[0];
        const npy_ubyte prod1 = lhs[1] * rhs[1];
        const npy_ubyte prod2 = lhs[2] * rhs[2];
        const npy_ubyte prod3 = lhs[3] * rhs[3];

        total += prod0 + prod1 + prod2 + prod3;

        count -= 4;
        lhs += 4;
        rhs += 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    while (count > 0) {
        const npy_ubyte x = *lhs;
        const npy_ubyte y = *rhs;

        total += x * y;

        --count;
        ++lhs;
        ++rhs;
    }

    /* Fold the local accumulator into the existing output value. */
    *out = *out + total;
}

/*
 * Two-operand kernel with a single-element output.
 *
 * Operand 0 contributes one value, read once from dataptr[0].  Operand 1 is
 * handed to ubyte_sum_of_arr() together with `count`.  The product of that
 * value and the returned sum is added to the element at dataptr[2]; the
 * stored result is truncated to npy_ubyte.
 *
 * `nop` and the strides are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte scalar = *(npy_ubyte *)dataptr[0];
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);
    npy_ubyte *out = (npy_ubyte *)dataptr[2];

    *out = (*out) + scalar * total;
}

/*
 * Two-operand kernel with a single-element output.
 *
 * Operand 0 is handed to ubyte_sum_of_arr() together with `count`.
 * Operand 1 contributes one value, read once from dataptr[1].  The product
 * of that value and the returned sum is added to the element at dataptr[2];
 * the stored result is truncated to npy_ubyte.
 *
 * `nop` and the strides are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ubyte scalar = *(npy_ubyte *)dataptr[1];
    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
    npy_ubyte *out = (npy_ubyte *)dataptr[2];

    *out = (*out) + scalar * total;
}

#elif 1000 == 3 && !0

/*
 * Three-operand kernel over consecutive npy_ubyte elements:
 *
 *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
 *
 * where in0..in2 are dataptr[0..2] and out is dataptr[3].  Elements are
 * visited in ascending order: first in groups of eight while at least
 * eight remain, then one at a time until the remaining count reaches zero
 * (the second stage touches at most eight elements).
 *
 * `nop` and the strides are not used; dataptr itself is not modified.
 */
static void
ubyte_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ubyte *in0 = (npy_ubyte *)dataptr[0];
    npy_ubyte *in1 = (npy_ubyte *)dataptr[1];
    npy_ubyte *in2 = (npy_ubyte *)dataptr[2];
    npy_ubyte *out = (npy_ubyte *)dataptr[3];
    int k;

    /* Full groups of eight elements */
    for (; count >= 8; count -= 8) {
        for (k = 0; k < 8; ++k) {
            out[k] = in0[k] * in1[k] * in2[k] + out[k];
        }
        in0 += 8;
        in1 += 8;
        in2 += 8;
        out += 8;
    }

    /* Remainder: leave as soon as the count has been used up */
    for (k = 0; k < 8; ++k) {
        if (count-- == 0) {
            return;
        }
        out[k] = in0[k] * in1[k] * in2[k] + out[k];
    }
}

#else /* 1000 > 3 || @complex */

/*
 * Generic kernel for an arbitrary number of operands whose elements are
 * consecutive npy_ubyte values.
 *
 * Each of the `count` iterations multiplies the current element of
 * operands 0 .. nop-1 together (in npy_ubyte, so the running product is
 * truncated after every step), adds that product to the current element of
 * the output operand dataptr[nop], and then advances all nop + 1 pointers
 * in dataptr by sizeof(npy_ubyte).
 *
 * nop      - number of input operands; the output is dataptr[nop]
 * dataptr  - nop + 1 element pointers, advanced in place
 * strides  - not used
 * count    - number of elements to process
 */
static void
ubyte_sum_of_products_contig_any(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_any (%d)\n",
                                                    (int)count);

    while (count--) {
        npy_ubyte *out = (npy_ubyte *)dataptr[nop];
        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
        int i;

        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_ubyte *)dataptr[i]);
        }

        /*
         * Read and write the output through the same, explicitly indexed
         * slot rather than through whatever value the loop index was left
         * at, which only equals nop when the loop above was set up with
         * nop >= 1.
         */
        *out = (temp + (*out));

        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_ubyte);
        }
    }
}

#endif /* functions for various 1000 */

#if 1000 == 1

/*
 * One-operand kernel with a single-element output: adds the value returned
 * by ubyte_sum_of_arr() for dataptr[0] and `count` to the element at
 * dataptr[1].  The stored result is truncated to npy_ubyte.
 *
 * `nop` and `strides` are not used.
 */
static NPY_GCC_OPT_3 void
ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_ubyte *out = (npy_ubyte *)dataptr[1];
    npy_ubyte total;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);

    total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
    *out += total;
}

#endif /* 1000 == 1 */

/*
 * Generic strided kernel with a single-element output.
 *
 * Each of the `count` iterations multiplies the current npy_ubyte element
 * of operands 0 .. nop-1 together, adds the product to a running npy_ubyte
 * total, and advances each input pointer dataptr[k] by strides[k] bytes.
 * After the loop the total is added to the element at dataptr[nop].
 *
 * All arithmetic is stored back into npy_ubyte, so every intermediate is
 * truncated to that type.  The input entries of dataptr are modified in
 * place; dataptr[nop] is left where it was.
 */
static void
ubyte_sum_of_products_outstride0_any(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    npy_ubyte total = 0;
    int op;

    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_any (%d)\n",
                                                    (int)count);

    for (; count != 0; --count) {
        npy_ubyte product = *(npy_ubyte *)dataptr[0];

        for (op = 1; op < nop; ++op) {
            product *= *(npy_ubyte *)dataptr[op];
        }
        total += product;

        /* Step every input to its next element */
        for (op = 0; op < nop; ++op) {
            dataptr[op] += strides[op];
        }
    }

    *(npy_ubyte *)dataptr[nop] += total;
}




#line 74

#if !0
/*
 * Returns the sum of the `count` npy_ushort values starting at `data`,
 * truncated to npy_ushort.  A count of zero or less yields 0.
 *
 * Unless NPY_DISABLE_OPTIMIZATION is defined, elements are first consumed
 * four at a time while more than four remain; the rest are added singly.
 */
static NPY_GCC_OPT_3 npy_ushort ushort_sum_of_arr(npy_ushort *data, npy_intp count)
{
    npy_ushort total = 0;

#ifndef NPY_DISABLE_OPTIMIZATION
    while (count > 4) {
        const npy_ushort first_pair = data[0] + data[1];
        const npy_ushort second_pair = data[2] + data[3];

        total += first_pair + second_pair;
        data += 4;
        count -= 4;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    while (count > 0) {
        total += *data;
        ++data;
        --count;
    }
    return total;
}
#endif

#line 131
/*
 * One-operand strided kernel: for each of the `count` iterations, adds the
 * npy_ushort at the input position to the npy_ushort at the output
 * position, then moves the input by strides[0] bytes and the output by
 * strides[1] bytes.
 *
 * The input starts at dataptr[0] and the output at dataptr[1]; dataptr
 * itself is not modified.  `nop` is not used.
 */
static void
ushort_sum_of_products_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    char *src = dataptr[0];
    char *dst = dataptr[1];
    const npy_intp src_step = strides[0];
    const npy_intp dst_step = strides[1];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_one (%d)\n", (int)count);

    for (; count != 0; --count) {
        *(npy_ushort *)dst += *(npy_ushort *)src;
        src += src_step;
        dst += dst_step;
    }
}

#if 1 == 1

/*
 * One-operand kernel over consecutive npy_ushort elements:
 *
 *     dst[k] = src[k] + dst[k]
 *
 * with src = dataptr[0] and dst = dataptr[1].
 *
 * A run of 0..7 elements is finished directly, highest index first, and
 * the function returns.  Longer runs are first reduced in groups of eight
 * (lowest index first within each group) until fewer than eight elements
 * remain, and are then finished the same way.  The short-run check comes
 * first so that small counts do not pay for the grouped loop.
 *
 * `nop` and the strides are not used; dataptr itself is not modified.
 * NOTE(review): a negative count matches neither stage and would never
 * reach the return - callers are presumably expected to pass count >= 0.
 */
static void
ushort_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *src = (npy_ushort *)dataptr[0];
    npy_ushort *dst = (npy_ushort *)dataptr[1];
    npy_intp k;

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

    for (;;) {
        /* Fewer than eight elements left: handle them and leave */
        if (count >= 0 && count <= 7) {
            for (k = count - 1; k >= 0; --k) {
                dst[k] = src[k] + dst[k];
            }
            return;
        }

        /* Groups of eight */
        while (count >= 8) {
            for (k = 0; k < 8; ++k) {
                dst[k] = src[k] + dst[k];
            }
            src += 8;
            dst += 8;
            count -= 8;
        }
    }
}

#elif 1 == 2 && !0

/*
 * Multiply-accumulate a run of npy_ushort values by a scalar:
 *
 *     data_out[k] = scalar * data[k] + data_out[k]      for k in [0, count)
 *
 * with each result truncated to npy_ushort when it is stored.
 *
 * The arithmetic is carried out in unsigned int.  Under the default integer
 * promotions two npy_ushort operands are multiplied as signed int whenever
 * int can represent every npy_ushort value, and a product larger than
 * INT_MAX is then undefined behaviour.  Unsigned arithmetic wraps instead,
 * and the truncating store keeps the same low-order bits.
 *
 * data      - input values, only read
 * data_out  - accumulator, updated in place
 * scalar    - factor applied to every element of data
 * count     - number of elements; nothing is done for count <= 0
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
{
    const unsigned int factor = scalar;

#ifndef NPY_DISABLE_OPTIMIZATION
    /*
     * Four elements per pass.  All eight loads are issued before the first
     * store, exactly as in the element order of the original unrolled loop.
     */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        const npy_ushort b0 = data[0];
        const npy_ushort c0 = data_out[0];
        const npy_ushort b1 = data[1];
        const npy_ushort c1 = data_out[1];
        const npy_ushort b2 = data[2];
        const npy_ushort c2 = data_out[2];
        const npy_ushort b3 = data[3];
        const npy_ushort c3 = data_out[3];

        const npy_ushort abc0 = (npy_ushort)(factor * b0 + c0);
        const npy_ushort abc1 = (npy_ushort)(factor * b1 + c1);
        const npy_ushort abc2 = (npy_ushort)(factor * b2 + c2);
        const npy_ushort abc3 = (npy_ushort)(factor * b3 + c3);

        data_out[0] = abc0;
        data_out[1] = abc1;
        data_out[2] = abc2;
        data_out[3] = abc3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION

    /* Remaining elements, one at a time */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_ushort b = (*data);
        const npy_ushort c = (*data_out);

        *data_out = (npy_ushort)(factor * b + c);
    }
}

/*
 * Contiguous two-operand kernel:
 *     data_out[i] = data0[i] * data1[i] + data_out[i]   for i in [0, count)
 *
 * dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the output; all
 * three are accessed as contiguous npy_ushort arrays.  `nop` and the strides
 * are not used.
 *
 * The universal-intrinsics (NPYV) path below is compiled out (`#if 0`) for
 * this type, so only the scalar path after `#else` is built.
 *
 * In the scalar path the product is formed in `unsigned int`.  Without the
 * cast, integer promotion turns npy_ushort * npy_ushort into a signed `int`
 * multiplication, and 0xFFFF * 0xFFFF overflows a 32-bit int, which is
 * undefined behaviour.  Unsigned arithmetic wraps instead, and the explicit
 * narrowing back to npy_ushort keeps the low bits exactly as before.
 */
static void
ushort_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort *data_out = (npy_ushort *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_ushort
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u16;

    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);

#line 390
            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);

#line 390
            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);

#line 390
            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);

            #line 397
            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);

#line 397
            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);

#line 397
            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);

#line 397
            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);

            #line 402
            npyv_storea_u16(data_out + vstep * 0, abc0);

#line 402
            npyv_storea_u16(data_out + vstep * 1, abc1);

#line 402
            npyv_storea_u16(data_out + vstep * 2, abc2);

#line 402
            npyv_storea_u16(data_out + vstep * 3, abc3);

        }
    }

#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);

#line 390
            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);

#line 390
            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);

#line 390
            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);

            #line 397
            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);

#line 397
            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);

#line 397
            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);

#line 397
            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);

            #line 402
            npyv_store_u16(data_out + vstep * 0, abc0);

#line 402
            npyv_store_u16(data_out + vstep * 1, abc1);

#line 402
            npyv_store_u16(data_out + vstep * 2, abc2);

#line 402
            npyv_store_u16(data_out + vstep * 3, abc3);

        }
    }

    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_u16 a = npyv_load_tillz_u16(data0, count);
        npyv_u16 b = npyv_load_tillz_u16(data1, count);
        npyv_u16 c = npyv_load_tillz_u16(data_out, count);
        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, unrolled by 4: load everything, compute, then store. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_ushort a0 = (data0[0]);
        const npy_ushort b0 = (data1[0]);
        const npy_ushort c0 = (data_out[0]);

#line 420
        const npy_ushort a1 = (data0[1]);
        const npy_ushort b1 = (data1[1]);
        const npy_ushort c1 = (data_out[1]);

#line 420
        const npy_ushort a2 = (data0[2]);
        const npy_ushort b2 = (data1[2]);
        const npy_ushort c2 = (data_out[2]);

#line 420
        const npy_ushort a3 = (data0[3]);
        const npy_ushort b3 = (data1[3]);
        const npy_ushort c3 = (data_out[3]);

        /* Multiply in unsigned int to avoid signed-overflow UB (see header). */
        #line 427
        const npy_ushort abc0 = (npy_ushort)((unsigned int)a0 * b0 + c0);

#line 427
        const npy_ushort abc1 = (npy_ushort)((unsigned int)a1 * b1 + c1);

#line 427
        const npy_ushort abc2 = (npy_ushort)((unsigned int)a2 * b2 + c2);

#line 427
        const npy_ushort abc3 = (npy_ushort)((unsigned int)a3 * b3 + c3);

        #line 432
        data_out[0] = (abc0);

#line 432
        data_out[1] = (abc1);

#line 432
        data_out[2] = (abc2);

#line 432
        data_out[3] = (abc3);

    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_ushort a = (*data0);
        const npy_ushort b = (*data1);
        const npy_ushort c = (*data_out);
        *data_out = (npy_ushort)((unsigned int)a * b + c);
    }
#endif // NPYV check for npy_ushort

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand kernel where operand 0 is a single value (read once from
 * dataptr[0]) and operand 1 and the output are contiguous npy_ushort arrays.
 * The work is delegated to ushort_sum_of_products_muladd, passing operand 1
 * as the data array and operand 0 as the scalar.
 */
static void
ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ushort scalar0 = *(npy_ushort *)dataptr[0];
    npy_ushort *const in1 = (npy_ushort *)dataptr[1];
    npy_ushort *const out = (npy_ushort *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);
    ushort_sum_of_products_muladd(in1, out, scalar0, count);
}

/*
 * Two-operand kernel where operand 1 is a single value (read once from
 * dataptr[1]) and operand 0 and the output are contiguous npy_ushort arrays.
 * The work is delegated to ushort_sum_of_products_muladd, passing operand 0
 * as the data array and operand 1 as the scalar.
 */
static void
ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *const in0 = (npy_ushort *)dataptr[0];
    const npy_ushort scalar1 = *(npy_ushort *)dataptr[1];
    npy_ushort *const out = (npy_ushort *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);
    ushort_sum_of_products_muladd(in0, out, scalar1, count);
}

/*
 * Two contiguous inputs reduced into one output element:
 *     *dataptr[2] = *dataptr[2] + sum over i in [0, count) of data0[i] * data1[i]
 *
 * dataptr[0] and dataptr[1] are read as contiguous npy_ushort arrays; the
 * running sum is kept in an npy_ushort accumulator and added to the single
 * output element at the end.  `nop` and the strides are not used.
 *
 * The universal-intrinsics (NPYV) path is compiled out (`#if 0`) for this
 * type, so only the scalar path after `#else` is built.
 *
 * In the scalar path each product is formed in `unsigned int`.  Without the
 * cast, integer promotion makes npy_ushort * npy_ushort a signed `int`
 * multiplication, and 0xFFFF * 0xFFFF overflows a 32-bit int (undefined
 * behaviour).  Unsigned arithmetic wraps, and narrowing back to npy_ushort
 * keeps the same low bits.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort accum = 0;

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_ushort
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_u16;
    npyv_u16 vaccum = npyv_zero_u16();

    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);

#line 501
            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);

#line 501
            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);

#line 501
            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);

            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, vaccum);
            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
                    vaccum = npyv_muladd_u16(a0, b0, ab1);
        }
    }

#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);

#line 501
            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);

#line 501
            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);

#line 501
            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);

            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, vaccum);
            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
                    vaccum = npyv_muladd_u16(a0, b0, ab1);
        }
    }

    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_u16 a = npyv_load_tillz_u16(data0, count);
        npyv_u16 b = npyv_load_tillz_u16(data1, count);
        vaccum = npyv_muladd_u16(a, b, vaccum);
    }
    accum = npyv_sum_u16(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Scalar path, unrolled by 4. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        /* Multiply in unsigned int to avoid signed-overflow UB (see header). */
        #line 524
        const npy_ushort ab0 = (npy_ushort)((unsigned int)(data0[0]) * (data1[0]));

#line 524
        const npy_ushort ab1 = (npy_ushort)((unsigned int)(data0[1]) * (data1[1]));

#line 524
        const npy_ushort ab2 = (npy_ushort)((unsigned int)(data0[2]) * (data1[2]));

#line 524
        const npy_ushort ab3 = (npy_ushort)((unsigned int)(data0[3]) * (data1[3]));

        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_ushort a = (*data0);
        const npy_ushort b = (*data1);
        accum += (unsigned int)a * b;
    }
#endif // NPYV check for npy_ushort
    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
}

/*
 * Two-operand reduction where operand 0 is a single value and operand 1 is
 * a contiguous npy_ushort array:
 *     *dataptr[2] = *dataptr[2] + value0 * ushort_sum_of_arr(data1, count)
 *
 * The product is formed in `unsigned int`: with plain npy_ushort operands,
 * integer promotion would make it a signed `int` multiplication, which can
 * overflow (undefined behaviour) for large values.  Unsigned arithmetic
 * wraps, and narrowing to npy_ushort keeps the same low bits.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
    npy_ushort accum = ushort_sum_of_arr(data1, count);
    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
                                             (unsigned int)value0 * accum);
}

/*
 * Two-operand reduction where operand 0 is a contiguous npy_ushort array
 * and operand 1 is a single value:
 *     *dataptr[2] = *dataptr[2] + value1 * ushort_sum_of_arr(data0, count)
 *
 * The product is formed in `unsigned int`: with plain npy_ushort operands,
 * integer promotion would make it a signed `int` multiplication, which can
 * overflow (undefined behaviour) for large values.  Unsigned arithmetic
 * wraps, and narrowing to npy_ushort keeps the same low bits.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
    npy_ushort accum = ushort_sum_of_arr(data0, count);
    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
                                             (unsigned int)value1 * accum);
}

#elif 1 == 3 && !0

/*
 * Contiguous three-operand kernel:
 *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
 * for i in [0, count), processed in ascending index order.
 *
 * dataptr[0..2] are the inputs and dataptr[3] is the output, all accessed as
 * contiguous npy_ushort arrays.  `nop` and the strides are not used.
 *
 * The product is formed in `unsigned int`.  Without the cast, integer
 * promotion makes the npy_ushort operands signed `int`, and the product can
 * overflow a 32-bit int (undefined behaviour).  Unsigned arithmetic wraps,
 * and narrowing back to npy_ushort keeps the same low bits.
 */
static void
ushort_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort *data2 = (npy_ushort *)dataptr[2];
    npy_ushort *data_out = (npy_ushort *)dataptr[3];
    npy_intp i;

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

        data_out[0] = (npy_ushort)((unsigned int)data0[0] * data1[0] * data2[0] +
                                   data_out[0]);
        data_out[1] = (npy_ushort)((unsigned int)data0[1] * data1[1] * data2[1] +
                                   data_out[1]);
        data_out[2] = (npy_ushort)((unsigned int)data0[2] * data1[2] * data2[2] +
                                   data_out[2]);
        data_out[3] = (npy_ushort)((unsigned int)data0[3] * data1[3] * data2[3] +
                                   data_out[3]);
        data_out[4] = (npy_ushort)((unsigned int)data0[4] * data1[4] * data2[4] +
                                   data_out[4]);
        data_out[5] = (npy_ushort)((unsigned int)data0[5] * data1[5] * data2[5] +
                                   data_out[5]);
        data_out[6] = (npy_ushort)((unsigned int)data0[6] * data1[6] * data2[6] +
                                   data_out[6]);
        data_out[7] = (npy_ushort)((unsigned int)data0[7] * data1[7] * data2[7] +
                                   data_out[7]);

        data0 += 8;
        data1 += 8;
        data2 += 8;
        data_out += 8;
    }

    /* Finish off the loop: fewer than 8 elements remain at this point */
    for (i = 0; i < count; ++i) {
        data_out[i] = (npy_ushort)((unsigned int)data0[i] * data1[i] * data2[i] +
                                   data_out[i]);
    }
}

#else /* 1 > 3 || @complex */

/*
 * Generic contiguous kernel for `nop` operands.  For each of `count` steps
 * it multiplies the current npy_ushort element of dataptr[0..nop-1], adds
 * the current element of the output dataptr[nop], stores the result there,
 * and then advances every pointer in dataptr[0..nop] by one element.  Note
 * that the entries of dataptr[] themselves are modified.
 *
 * Only the `#if !0` (non-complex) arm is compiled; the complex arm is kept
 * as generated.
 *
 * The running product is formed in `unsigned int`.  Without the cast,
 * integer promotion makes npy_ushort * npy_ushort a signed `int`
 * multiplication, which can overflow a 32-bit int (undefined behaviour).
 * Unsigned arithmetic wraps, and narrowing back to npy_ushort keeps the
 * same low bits.
 */
static void
ushort_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp = (npy_ushort)((unsigned int)temp *
                                (*(npy_ushort *)dataptr[i]));
        }
        /* The loop above leaves i == nop (for nop >= 1): the output operand. */
        *(npy_ushort *)dataptr[nop] = (temp +
                                           (*(npy_ushort *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_ushort);
        }
#else /* complex */
#  if 1 <= 3
#    define _SUMPROD_NOP 1
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_ushort);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 1 */

#if 1 == 1

/*
 * One contiguous input reduced into a single output element:
 *     *dataptr[1] = ushort_sum_of_arr(dataptr[0], count) + *dataptr[1]
 *
 * Only the `#if !0` (non-complex) arm is compiled; the complex arm below is
 * kept as generated.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_ushort *const out = (npy_ushort *)dataptr[1];
    /* Sum the whole input first, then fold it into the output element. */
    const npy_ushort total = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);
    *out = (total + (*out));
#else
    npy_ushort accum_re = 0, accum_im = 0;
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
#ifndef NPY_DISABLE_OPTIMIZATION
    for (; count > 4; count -= 4, data0 += 4*2) {
        const npy_ushort re01 = data0[0] + data0[2];
        const npy_ushort re23 = data0[4] + data0[6];
        const npy_ushort im13 = data0[1] + data0[3];
        const npy_ushort im57 = data0[5] + data0[7];
        accum_re += re01 + re23;
        accum_im += im13 + im57;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    for (; count > 0; --count, data0 += 2) {
        accum_re += data0[0];
        accum_im += data0[1];
    }
    ((npy_ushort *)dataptr[1])[0] += accum_re;
    ((npy_ushort *)dataptr[1])[1] += accum_im;
#endif // !0
}

#endif /* 1 == 1 */

/*
 * One-operand reduction into a single output element for an input with an
 * arbitrary byte stride: each of the `count` npy_ushort values reached from
 * dataptr[0] in steps of strides[0] bytes is added to a local accumulator,
 * which is then added to the element at dataptr[1].
 *
 * This is the generic template expanded with the operand count fixed to 1
 * and the complex flag fixed to 0, so only the `1 == 1` arm of each
 * `#if !0` section is compiled; the two-/three-/n-operand arms and all
 * complex arms are inactive.
 */
static void
ushort_sum_of_products_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    /* Accumulator(s): the complex pair is inactive, the scalar one is used. */
#if 0
    npy_ushort accum_re = 0, accum_im = 0;
#else
    npy_ushort accum = 0;
#endif

    /* Cache pointer and byte stride of each operand handled explicitly. */
#if (1 == 1) || (1 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (1 == 2 || 1 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (1 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_one (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 1 == 1
        /* Active arm: accumulate one element and step by the byte stride. */
        accum += (*(npy_ushort *)data0);
        data0 += stride0;
#  elif 1 == 2
        accum += (*(npy_ushort *)data0) *
                 (*(npy_ushort *)data1);
        data0 += stride0;
        data1 += stride1;
#  elif 1 == 3
        accum += (*(npy_ushort *)data0) *
                 (*(npy_ushort *)data1) *
                 (*(npy_ushort *)data2);
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp *= (*(npy_ushort *)dataptr[i]);
        }
        accum += temp;
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 1 == 1
        accum_re += ((npy_ushort *)data0)[0];
        accum_im += ((npy_ushort *)data0)[1];
        data0 += stride0;
#  else
#    if 1 <= 3
#define _SUMPROD_NOP 1
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulated sum into the single output element. */
#if 0
#  if 1 <= 3
    ((npy_ushort *)dataptr[1])[0] += accum_re;
    ((npy_ushort *)dataptr[1])[1] += accum_im;
#  else
    ((npy_ushort *)dataptr[nop])[0] += accum_re;
    ((npy_ushort *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 1 <= 3
    /* Active arm: the output operand is dataptr[1]. */
    *((npy_ushort *)dataptr[1]) = (accum +
                                    (*((npy_ushort *)dataptr[1])));
#  else
    *((npy_ushort *)dataptr[nop]) = (accum +
                                    (*((npy_ushort *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * General strided two-operand kernel.  For each of `count` steps:
 *     *out = (*in0) * (*in1) + (*out)
 * where in0/in1/out start at dataptr[0]/dataptr[1]/dataptr[2] and advance
 * by strides[0]/strides[1]/strides[2] bytes per step.
 *
 * This is the generic template expanded with the operand count fixed to 2
 * and the complex flag fixed to 0, so only the `2 == 2` arm of the `#if !0`
 * section is compiled; the other arms are inactive.
 *
 * The product is formed in `unsigned int`.  Without the cast, integer
 * promotion makes npy_ushort * npy_ushort a signed `int` multiplication,
 * and 0xFFFF * 0xFFFF overflows a 32-bit int (undefined behaviour).
 * Unsigned arithmetic wraps, and narrowing back to npy_ushort keeps the
 * same low bits.  The same form is used in the inactive arms for
 * consistency.
 */
static void
ushort_sum_of_products_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (2 == 1) || (2 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (2 == 2 || 2 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (2 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
#if (2 == 1) || (2 <= 3 && !0)
    char *data_out = dataptr[2];
    npy_intp stride_out = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_two (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 2 == 1
        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 2 == 2
        /* Active arm: multiply-accumulate one element, then step all three. */
        *(npy_ushort *)data_out = (npy_ushort)(
                                         (unsigned int)(*(npy_ushort *)data0) *
                                         (*(npy_ushort *)data1) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 2 == 3
        *(npy_ushort *)data_out = (npy_ushort)(
                                         (unsigned int)(*(npy_ushort *)data0) *
                                         (*(npy_ushort *)data1) *
                                         (*(npy_ushort *)data2) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp = (npy_ushort)((unsigned int)temp *
                                (*(npy_ushort *)dataptr[i]));
        }
        *(npy_ushort *)dataptr[nop] = (temp +
                                           (*(npy_ushort *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 2 == 1
        ((npy_ushort *)data_out)[0] = ((npy_ushort *)data0)[0] +
                                         ((npy_ushort *)data_out)[0];
        ((npy_ushort *)data_out)[1] = ((npy_ushort *)data0)[1] +
                                         ((npy_ushort *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 2 == 1

/*
 * Contiguous one-operand kernel:
 *     data_out[i] = data0[i] + data_out[i]   for i in [0, count)
 * with dataptr[0] as the input and dataptr[1] as the output, both accessed
 * as contiguous npy_ushort arrays.
 *
 * Control flow: the switch at the label handles a remaining count of 0..7
 * and returns; any other count matches no case and reaches the loop below,
 * which consumes blocks of 8 and then jumps back to the switch for the
 * remainder.
 *
 * In every `#if !0` / `#else` pair only the first (non-complex) arm is
 * compiled.  NOTE: this definition sits under `#if 2 == 1`, so it is not
 * compiled in this two-operand expansion of the template.
 */
static void
ushort_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data_out = (npy_ushort *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    /*
     * Every case falls through to the next one: entering at `case k+1`
     * processes elements k, k-1, ..., 0 and ends at `case 0`, which returns.
     */
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_ushort *)data_out + 2*6)[0] =
                                    ((npy_ushort *)data0 + 2*6)[0] +
                                    ((npy_ushort *)data_out + 2*6)[0];
            ((npy_ushort *)data_out + 2*6)[1] =
                                    ((npy_ushort *)data0 + 2*6)[1] +
                                    ((npy_ushort *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_ushort *)data_out + 2*5)[0] =
                                    ((npy_ushort *)data0 + 2*5)[0] +
                                    ((npy_ushort *)data_out + 2*5)[0];
            ((npy_ushort *)data_out + 2*5)[1] =
                                    ((npy_ushort *)data0 + 2*5)[1] +
                                    ((npy_ushort *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_ushort *)data_out + 2*4)[0] =
                                    ((npy_ushort *)data0 + 2*4)[0] +
                                    ((npy_ushort *)data_out + 2*4)[0];
            ((npy_ushort *)data_out + 2*4)[1] =
                                    ((npy_ushort *)data0 + 2*4)[1] +
                                    ((npy_ushort *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_ushort *)data_out + 2*3)[0] =
                                    ((npy_ushort *)data0 + 2*3)[0] +
                                    ((npy_ushort *)data_out + 2*3)[0];
            ((npy_ushort *)data_out + 2*3)[1] =
                                    ((npy_ushort *)data0 + 2*3)[1] +
                                    ((npy_ushort *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_ushort *)data_out + 2*2)[0] =
                                    ((npy_ushort *)data0 + 2*2)[0] +
                                    ((npy_ushort *)data_out + 2*2)[0];
            ((npy_ushort *)data_out + 2*2)[1] =
                                    ((npy_ushort *)data0 + 2*2)[1] +
                                    ((npy_ushort *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_ushort *)data_out + 2*1)[0] =
                                    ((npy_ushort *)data0 + 2*1)[0] +
                                    ((npy_ushort *)data_out + 2*1)[0];
            ((npy_ushort *)data_out + 2*1)[1] =
                                    ((npy_ushort *)data0 + 2*1)[1] +
                                    ((npy_ushort *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_ushort *)data_out + 2*0)[0] =
                                    ((npy_ushort *)data0 + 2*0)[0] +
                                    ((npy_ushort *)data_out + 2*0)[0];
            ((npy_ushort *)data_out + 2*0)[1] =
                                    ((npy_ushort *)data0 + 2*0)[1] +
                                    ((npy_ushort *)data_out + 2*0)[1];
#endif

        /* Nothing (left) to do: this is the only exit from the function. */
        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_ushort *)data_out + 2*0)[0] =
                                ((npy_ushort *)data0 + 2*0)[0] +
                                ((npy_ushort *)data_out + 2*0)[0];
        ((npy_ushort *)data_out + 2*0)[1] =
                                ((npy_ushort *)data0 + 2*0)[1] +
                                ((npy_ushort *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_ushort *)data_out + 2*1)[0] =
                                ((npy_ushort *)data0 + 2*1)[0] +
                                ((npy_ushort *)data_out + 2*1)[0];
        ((npy_ushort *)data_out + 2*1)[1] =
                                ((npy_ushort *)data0 + 2*1)[1] +
                                ((npy_ushort *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_ushort *)data_out + 2*2)[0] =
                                ((npy_ushort *)data0 + 2*2)[0] +
                                ((npy_ushort *)data_out + 2*2)[0];
        ((npy_ushort *)data_out + 2*2)[1] =
                                ((npy_ushort *)data0 + 2*2)[1] +
                                ((npy_ushort *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_ushort *)data_out + 2*3)[0] =
                                ((npy_ushort *)data0 + 2*3)[0] +
                                ((npy_ushort *)data_out + 2*3)[0];
        ((npy_ushort *)data_out + 2*3)[1] =
                                ((npy_ushort *)data0 + 2*3)[1] +
                                ((npy_ushort *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_ushort *)data_out + 2*4)[0] =
                                ((npy_ushort *)data0 + 2*4)[0] +
                                ((npy_ushort *)data_out + 2*4)[0];
        ((npy_ushort *)data_out + 2*4)[1] =
                                ((npy_ushort *)data0 + 2*4)[1] +
                                ((npy_ushort *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_ushort *)data_out + 2*5)[0] =
                                ((npy_ushort *)data0 + 2*5)[0] +
                                ((npy_ushort *)data_out + 2*5)[0];
        ((npy_ushort *)data_out + 2*5)[1] =
                                ((npy_ushort *)data0 + 2*5)[1] +
                                ((npy_ushort *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_ushort *)data_out + 2*6)[0] =
                                ((npy_ushort *)data0 + 2*6)[0] +
                                ((npy_ushort *)data_out + 2*6)[0];
        ((npy_ushort *)data_out + 2*6)[1] =
                                ((npy_ushort *)data0 + 2*6)[1] +
                                ((npy_ushort *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_ushort *)data_out + 2*7)[0] =
                                ((npy_ushort *)data0 + 2*7)[0] +
                                ((npy_ushort *)data_out + 2*7)[0];
        ((npy_ushort *)data_out + 2*7)[1] =
                                ((npy_ushort *)data0 + 2*7)[1] +
                                ((npy_ushort *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    /* Fewer than 8 elements remain; the switch above handles them and returns. */
    goto finish_after_unrolled_loop;
}

#elif 2 == 2 && !0

/*
 * Multiply-accumulate over contiguous npy_ushort data:
 *
 *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
 *
 * Parameters:
 *     data      input elements, read only
 *     data_out  output elements, updated in place
 *     scalar    factor applied to every element of `data`
 *     count     number of elements to process
 *
 * The universal-intrinsics branch is compiled out for this type (`#if 0`),
 * so only the scalar fallback is active.
 *
 * In the fallback the products are formed in unsigned int. Plain npy_ushort
 * operands would be integer-promoted before the multiplication and, where the
 * promoted type is a signed int, a large product overflows it (undefined
 * behavior). Unsigned arithmetic wraps instead; the value stored is that
 * result converted back to npy_ushort.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
{
#if 0 // NPYV check for npy_ushort
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u16;
    const npyv_u16 v_scalar = npyv_setall_u16(scalar);
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u16 b0 = npyv_loada_u16(data + vstep * 0);
            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);

#line 312
            npyv_u16 b1 = npyv_loada_u16(data + vstep * 1);
            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);

#line 312
            npyv_u16 b2 = npyv_loada_u16(data + vstep * 2);
            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);

#line 312
            npyv_u16 b3 = npyv_loada_u16(data + vstep * 3);
            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);

            #line 318
            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);

#line 318
            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);

#line 318
            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);

#line 318
            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);

            #line 323
            npyv_storea_u16(data_out + vstep * 0, abc0);

#line 323
            npyv_storea_u16(data_out + vstep * 1, abc1);

#line 323
            npyv_storea_u16(data_out + vstep * 2, abc2);

#line 323
            npyv_storea_u16(data_out + vstep * 3, abc3);

        }
    }

#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u16 b0 = npyv_load_u16(data + vstep * 0);
            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);

#line 312
            npyv_u16 b1 = npyv_load_u16(data + vstep * 1);
            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);

#line 312
            npyv_u16 b2 = npyv_load_u16(data + vstep * 2);
            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);

#line 312
            npyv_u16 b3 = npyv_load_u16(data + vstep * 3);
            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);

            #line 318
            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);

#line 318
            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);

#line 318
            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);

#line 318
            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);

            #line 323
            npyv_store_u16(data_out + vstep * 0, abc0);

#line 323
            npyv_store_u16(data_out + vstep * 1, abc1);

#line 323
            npyv_store_u16(data_out + vstep * 2, abc2);

#line 323
            npyv_store_u16(data_out + vstep * 3, abc3);

        }
    }

    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
        npyv_u16 a = npyv_load_tillz_u16(data, count);
        npyv_u16 b = npyv_load_tillz_u16(data_out, count);
        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, v_scalar, b));
    }
    npyv_cleanup();
#else
    /* Widened once so that every product below is formed in unsigned int. */
    const unsigned int factor = scalar;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Manually unrolled: four elements per iteration, loads before stores. */
    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
        #line 340
        const npy_ushort b0 = (data[0]);
        const npy_ushort c0 = (data_out[0]);

#line 340
        const npy_ushort b1 = (data[1]);
        const npy_ushort c1 = (data_out[1]);

#line 340
        const npy_ushort b2 = (data[2]);
        const npy_ushort c2 = (data_out[2]);

#line 340
        const npy_ushort b3 = (data[3]);
        const npy_ushort c3 = (data_out[3]);

        #line 346
        const npy_ushort abc0 = (npy_ushort)(factor * b0 + c0);

#line 346
        const npy_ushort abc1 = (npy_ushort)(factor * b1 + c1);

#line 346
        const npy_ushort abc2 = (npy_ushort)(factor * b2 + c2);

#line 346
        const npy_ushort abc3 = (npy_ushort)(factor * b3 + c3);

        #line 351
        data_out[0] = (abc0);

#line 351
        data_out[1] = (abc1);

#line 351
        data_out[2] = (abc2);

#line 351
        data_out[3] = (abc3);

    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data, ++data_out) {
        const npy_ushort b = (*data);
        const npy_ushort c = (*data_out);
        *data_out = (npy_ushort)(factor * b + c);
    }
#endif // NPYV check for npy_ushort
}

/*
 * Two-operand sum of products where both inputs and the output are walked
 * as contiguous npy_ushort arrays:
 *
 *     out[i] = in0[i] * in1[i] + out[i]    for i in [0, count)
 *
 * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].
 * `nop` and `strides` are not read.
 *
 * The universal-intrinsics branch is compiled out for this type (`#if 0`).
 * In the scalar fallback each product is formed in unsigned int: npy_ushort
 * operands would otherwise be integer-promoted and, where the promoted type
 * is a signed int, a large product overflows it (undefined behavior).
 * Unsigned arithmetic wraps, and the result is converted back to npy_ushort.
 */
static void
ushort_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort *data_out = (npy_ushort *)dataptr[2];
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
                                                            (int)count);
    // NPYV check for npy_ushort
#if 0
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
                        EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u16;

    #line 384
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);

#line 390
            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);

#line 390
            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);

#line 390
            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);

            #line 397
            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);

#line 397
            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);

#line 397
            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);

#line 397
            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);

            #line 402
            npyv_storea_u16(data_out + vstep * 0, abc0);

#line 402
            npyv_storea_u16(data_out + vstep * 1, abc1);

#line 402
            npyv_storea_u16(data_out + vstep * 2, abc2);

#line 402
            npyv_storea_u16(data_out + vstep * 3, abc3);

        }
    }

#line 384
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
            #line 390
            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);

#line 390
            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);

#line 390
            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);

#line 390
            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);

            #line 397
            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);

#line 397
            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);

#line 397
            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);

#line 397
            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);

            #line 402
            npyv_store_u16(data_out + vstep * 0, abc0);

#line 402
            npyv_store_u16(data_out + vstep * 1, abc1);

#line 402
            npyv_store_u16(data_out + vstep * 2, abc2);

#line 402
            npyv_store_u16(data_out + vstep * 3, abc3);

        }
    }

    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
        npyv_u16 a = npyv_load_tillz_u16(data0, count);
        npyv_u16 b = npyv_load_tillz_u16(data1, count);
        npyv_u16 c = npyv_load_tillz_u16(data_out, count);
        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, b, c));
    }
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Manually unrolled: four elements per iteration, loads before stores. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
        #line 420
        const npy_ushort a0 = (data0[0]);
        const npy_ushort b0 = (data1[0]);
        const npy_ushort c0 = (data_out[0]);

#line 420
        const npy_ushort a1 = (data0[1]);
        const npy_ushort b1 = (data1[1]);
        const npy_ushort c1 = (data_out[1]);

#line 420
        const npy_ushort a2 = (data0[2]);
        const npy_ushort b2 = (data1[2]);
        const npy_ushort c2 = (data_out[2]);

#line 420
        const npy_ushort a3 = (data0[3]);
        const npy_ushort b3 = (data1[3]);
        const npy_ushort c3 = (data_out[3]);

        /* Products in unsigned int: no signed overflow, result wraps. */
        #line 427
        const npy_ushort abc0 = (npy_ushort)((unsigned int)a0 * b0 + c0);

#line 427
        const npy_ushort abc1 = (npy_ushort)((unsigned int)a1 * b1 + c1);

#line 427
        const npy_ushort abc2 = (npy_ushort)((unsigned int)a2 * b2 + c2);

#line 427
        const npy_ushort abc3 = (npy_ushort)((unsigned int)a3 * b3 + c3);

        #line 432
        data_out[0] = (abc0);

#line 432
        data_out[1] = (abc1);

#line 432
        data_out[2] = (abc2);

#line 432
        data_out[3] = (abc3);

    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
        const npy_ushort a = (*data0);
        const npy_ushort b = (*data1);
        const npy_ushort c = (*data_out);
        *data_out = (npy_ushort)((unsigned int)a * b + c);
    }
#endif // NPYV check for npy_ushort

}

/* Some extra specializations for the two operand case */
/*
 * Two-operand case in which operand 0 contributes a single value and
 * operand 1 and the output are walked as contiguous npy_ushort arrays.
 * The value read from dataptr[0] is used as the scalar factor and the work
 * is handed to ushort_sum_of_products_muladd. `nop` and `strides` are not
 * read.
 */
static void
ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ushort factor = *(npy_ushort *)dataptr[0];
    npy_ushort *const src = (npy_ushort *)dataptr[1];
    npy_ushort *const dst = (npy_ushort *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
                                                    (int)count);

    /* dst[i] = factor * src[i] + dst[i] for each of the count elements */
    ushort_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two-operand case in which operand 1 contributes a single value and
 * operand 0 and the output are walked as contiguous npy_ushort arrays.
 * The value read from dataptr[1] is used as the scalar factor and the work
 * is handed to ushort_sum_of_products_muladd. `nop` and `strides` are not
 * read.
 */
static void
ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    const npy_ushort factor = *(npy_ushort *)dataptr[1];
    npy_ushort *const src = (npy_ushort *)dataptr[0];
    npy_ushort *const dst = (npy_ushort *)dataptr[2];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
                                                    (int)count);

    /* dst[i] = factor * src[i] + dst[i] for each of the count elements */
    ushort_sum_of_products_muladd(src, dst, factor, count);
}

/*
 * Two contiguous npy_ushort inputs reduced into a single output element:
 *
 *     *out = *out + sum(in0[i] * in1[i])    for i in [0, count)
 *
 * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].
 * `nop` and `strides` are not read.
 *
 * The universal-intrinsics branch is compiled out for this type (`#if 0`).
 * In the scalar fallback each product is formed in unsigned int: npy_ushort
 * operands would otherwise be integer-promoted and, where the promoted type
 * is a signed int, a large product overflows it (undefined behavior).
 * Unsigned arithmetic wraps, and the result is converted back to npy_ushort.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort accum = 0;

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
                                                    (int)count);
#if 0 // NPYV check for npy_ushort
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
    const int vstep = npyv_nlanes_u16;
    npyv_u16 vaccum = npyv_zero_u16();

    #line 495
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);

#line 501
            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);

#line 501
            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);

#line 501
            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);

            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, vaccum);
            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
                    vaccum = npyv_muladd_u16(a0, b0, ab1);
        }
    }

#line 495
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
            #line 501
            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);

#line 501
            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);

#line 501
            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);

#line 501
            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);

            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, vaccum);
            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
                    vaccum = npyv_muladd_u16(a0, b0, ab1);
        }
    }

    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
        npyv_u16 a = npyv_load_tillz_u16(data0, count);
        npyv_u16 b = npyv_load_tillz_u16(data1, count);
        vaccum = npyv_muladd_u16(a, b, vaccum);
    }
    accum = npyv_sum_u16(vaccum);
    npyv_cleanup();
#else
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Manually unrolled: four products per iteration. */
    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
        /* Products in unsigned int: no signed overflow, result wraps. */
        #line 524
        const npy_ushort ab0 = (npy_ushort)((unsigned int)data0[0] * data1[0]);

#line 524
        const npy_ushort ab1 = (npy_ushort)((unsigned int)data0[1] * data1[1]);

#line 524
        const npy_ushort ab2 = (npy_ushort)((unsigned int)data0[2] * data1[2]);

#line 524
        const npy_ushort ab3 = (npy_ushort)((unsigned int)data0[3] * data1[3]);

        accum += ab0 + ab1 + ab2 + ab3;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* Remaining elements (all of them when the unrolled loop is disabled). */
    for (; count > 0; --count, ++data0, ++data1) {
        const npy_ushort a = (*data0);
        const npy_ushort b = (*data1);
        accum = (npy_ushort)(accum + (unsigned int)a * b);
    }
#endif // NPYV check for npy_ushort
    /* Fold the partial sum into the single output element. */
    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
}

/*
 * Two-operand case in which operand 0 contributes a single value, operand 1
 * is a contiguous npy_ushort array and the output is a single element:
 *
 *     *out = *out + value0 * ushort_sum_of_arr(in1, count)
 *
 * ushort_sum_of_arr is defined elsewhere in this file; presumably it returns
 * the sum of the `count` elements (confirm against its definition).
 * `nop` and `strides` are not read.
 *
 * The product is formed in unsigned int: npy_ushort operands would otherwise
 * be integer-promoted and, where the promoted type is a signed int, a large
 * product overflows it (undefined behavior). Unsigned arithmetic wraps, and
 * the result is converted back to npy_ushort on the store.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
    npy_ushort accum = ushort_sum_of_arr(data1, count);
    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
                                             (unsigned int)value0 * accum);
}

/*
 * Two-operand case in which operand 1 contributes a single value, operand 0
 * is a contiguous npy_ushort array and the output is a single element:
 *
 *     *out = *out + value1 * ushort_sum_of_arr(in0, count)
 *
 * ushort_sum_of_arr is defined elsewhere in this file; presumably it returns
 * the sum of the `count` elements (confirm against its definition).
 * `nop` and `strides` are not read.
 *
 * The product is formed in unsigned int: npy_ushort operands would otherwise
 * be integer-promoted and, where the promoted type is a signed int, a large
 * product overflows it (undefined behavior). Unsigned arithmetic wraps, and
 * the result is converted back to npy_ushort on the store.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
    npy_ushort accum = ushort_sum_of_arr(data0, count);
    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
                                             (unsigned int)value1 * accum);
}

#elif 2 == 3 && !0

/*
 * Three-operand sum of products where all inputs and the output are walked
 * as contiguous npy_ushort arrays:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for i in [0, count)
 *
 * with in0..in2 = dataptr[0..2] and out = dataptr[3].
 * `nop` and `strides` are not read.
 *
 * The products are formed in unsigned int: npy_ushort operands would
 * otherwise be integer-promoted and, where the promoted type is a signed
 * int, a large product overflows it (undefined behavior). Unsigned
 * arithmetic wraps, and the result is converted back to npy_ushort.
 */
static void
ushort_sum_of_products_contig_three(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data1 = (npy_ushort *)dataptr[1];
    npy_ushort *data2 = (npy_ushort *)dataptr[2];
    npy_ushort *data_out = (npy_ushort *)dataptr[3];
    npy_intp i;

    /* Bulk of the work in blocks of 8 elements (fixed inner trip count) */
    for (; count >= 8; count -= 8,
                       data0 += 8, data1 += 8, data2 += 8, data_out += 8) {
        for (i = 0; i < 8; ++i) {
            data_out[i] = (npy_ushort)((unsigned int)data0[i] *
                                       data1[i] *
                                       data2[i] +
                                       data_out[i]);
        }
    }

    /* Finish off the loop: at most 7 elements are left */
    for (i = 0; i < count; ++i) {
        data_out[i] = (npy_ushort)((unsigned int)data0[i] *
                                   data1[i] *
                                   data2[i] +
                                   data_out[i]);
    }
}

#else /* 2 > 3 || @complex */

/*
 * Generic fallback variant: for each of `count` elements, multiply the
 * current element of operands 0..nop-1 together, add the product to the
 * current element of operand `nop` (the output), then advance every pointer
 * in dataptr by one element. `strides` is not read.
 *
 * In the non-complex branch the running product is formed in unsigned int:
 * npy_ushort operands would otherwise be integer-promoted and, where the
 * promoted type is a signed int, a large product overflows it (undefined
 * behavior). Unsigned arithmetic wraps, and the result is converted back to
 * npy_ushort. The complex branch is compiled out (`#if !0`).
 */
static void
ushort_sum_of_products_contig_two(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp = (npy_ushort)((unsigned int)temp *
                                (*(npy_ushort *)dataptr[i]));
        }
        /* For nop >= 1 the loop leaves i == nop, i.e. the output operand. */
        *(npy_ushort *)dataptr[nop] = (temp +
                                           (*(npy_ushort *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += sizeof(npy_ushort);
        }
#else /* complex */
#  if 2 <= 3
#    define _SUMPROD_NOP 2
#  else
#    define _SUMPROD_NOP nop
#  endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += sizeof(npy_ushort);
        }
#  undef _SUMPROD_NOP
#endif
    }
}

#endif /* functions for various 2 */

#if 2 == 1

/*
 * Single-operand reduction: the contiguous input dataptr[0] is summed and
 * the total is added to the single output element at dataptr[1].
 * The non-complex branch delegates the summation to ushort_sum_of_arr;
 * the complex branch (compiled out by `#if !0`) accumulates the two
 * interleaved components separately. `nop` and `strides` are not read.
 */
static NPY_GCC_OPT_3 void
ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
#if !0
    npy_ushort *out = (npy_ushort *)dataptr[1];
    const npy_ushort total = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);

    *out = (total + *out);
#else
    npy_ushort *src = (npy_ushort *)dataptr[0];
    npy_ushort *dst = (npy_ushort *)dataptr[1];
    npy_ushort sum_re = 0;
    npy_ushort sum_im = 0;
#ifndef NPY_DISABLE_OPTIMIZATION
    /* Four (re, im) pairs per iteration */
    while (count > 4) {
        const npy_ushort re_lo = src[0] + src[2];
        const npy_ushort re_hi = src[4] + src[6];
        const npy_ushort im_lo = src[1] + src[3];
        const npy_ushort im_hi = src[5] + src[7];
        sum_re += re_lo + re_hi;
        sum_im += im_lo + im_hi;
        count -= 4;
        src += 4*2;
    }
#endif // !NPY_DISABLE_OPTIMIZATION
    /* One (re, im) pair per iteration for whatever is left */
    while (count > 0) {
        sum_re += src[0];
        sum_im += src[1];
        --count;
        src += 2;
    }
    dst[0] += sum_re;
    dst[1] += sum_im;
#endif // !0
}

#endif /* 2 == 1 */

/*
 * Two-operand sum of products reduced into a single output element, with
 * arbitrary byte strides on the inputs:
 *
 *     *out = *out + sum(in0[i] * in1[i])    for i in [0, count)
 *
 * In the branch that is compiled here (`2 == 2`, non-complex) the inputs
 * start at dataptr[0] and dataptr[1] and are advanced by strides[0] and
 * strides[1] bytes per element; the output is the single element at
 * dataptr[2]. `nop` is only read by the generic branch.
 *
 * Every product is formed in unsigned int: npy_ushort operands would
 * otherwise be integer-promoted and, where the promoted type is a signed
 * int, a large product overflows it (undefined behavior). Unsigned
 * arithmetic wraps, and the result is converted back to npy_ushort.
 */
static void
ushort_sum_of_products_outstride0_two(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if 0
    npy_ushort accum_re = 0, accum_im = 0;
#else
    npy_ushort accum = 0;
#endif

#if (2 == 1) || (2 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (2 == 2 || 2 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (2 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_two (%d)\n",
                                                    (int)count);

    while (count--) {
#if !0
#  if 2 == 1
        accum += (*(npy_ushort *)data0);
        data0 += stride0;
#  elif 2 == 2
        accum = (npy_ushort)(accum +
                             (unsigned int)(*(npy_ushort *)data0) *
                             (*(npy_ushort *)data1));
        data0 += stride0;
        data1 += stride1;
#  elif 2 == 3
        accum = (npy_ushort)(accum +
                             (unsigned int)(*(npy_ushort *)data0) *
                             (*(npy_ushort *)data1) *
                             (*(npy_ushort *)data2));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
#  else
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp = (npy_ushort)((unsigned int)temp *
                                (*(npy_ushort *)dataptr[i]));
        }
        accum += temp;
        for (i = 0; i < nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 2 == 1
        accum_re += ((npy_ushort *)data0)[0];
        accum_im += ((npy_ushort *)data0)[1];
        data0 += stride0;
#  else
#    if 2 <= 3
#define _SUMPROD_NOP 2
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        accum_re += re;
        accum_im += im;
        for (i = 0; i < _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }

    /* Fold the accumulated value into the single output element. */
#if 0
#  if 2 <= 3
    ((npy_ushort *)dataptr[2])[0] += accum_re;
    ((npy_ushort *)dataptr[2])[1] += accum_im;
#  else
    ((npy_ushort *)dataptr[nop])[0] += accum_re;
    ((npy_ushort *)dataptr[nop])[1] += accum_im;
#  endif
#else
#  if 2 <= 3
    *((npy_ushort *)dataptr[2]) = (accum +
                                    (*((npy_ushort *)dataptr[2])));
#  else
    *((npy_ushort *)dataptr[nop]) = (accum +
                                    (*((npy_ushort *)dataptr[nop])));
#  endif
#endif

}


#line 131
/*
 * Three-operand sum of products with arbitrary byte strides:
 *
 *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for i in [0, count)
 *
 * In the branch that is compiled here (`3 == 3`, non-complex) the inputs
 * start at dataptr[0..2], the output at dataptr[3], and each pointer is
 * advanced by the matching strides[] entry (in bytes) after every element.
 * `nop` is only read by the generic branch.
 *
 * Every product is formed in unsigned int: npy_ushort operands would
 * otherwise be integer-promoted and, where the promoted type is a signed
 * int, a large product overflows it (undefined behavior). Unsigned
 * arithmetic wraps, and the result is converted back to npy_ushort.
 */
static void
ushort_sum_of_products_three(int nop, char **dataptr,
                                npy_intp const *strides, npy_intp count)
{
#if (3 == 1) || (3 <= 3 && !0)
    char *data0 = dataptr[0];
    npy_intp stride0 = strides[0];
#endif
#if (3 == 2 || 3 == 3) && !0
    char *data1 = dataptr[1];
    npy_intp stride1 = strides[1];
#endif
#if (3 == 3) && !0
    char *data2 = dataptr[2];
    npy_intp stride2 = strides[2];
#endif
#if (3 == 1) || (3 <= 3 && !0)
    char *data_out = dataptr[3];
    npy_intp stride_out = strides[3];
#endif

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_three (%d)\n", (int)count);

    while (count--) {
#if !0
#  if 3 == 1
        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data_out += stride_out;
#  elif 3 == 2
        *(npy_ushort *)data_out = (npy_ushort)(
                                         (unsigned int)(*(npy_ushort *)data0) *
                                         (*(npy_ushort *)data1) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data1 += stride1;
        data_out += stride_out;
#  elif 3 == 3
        *(npy_ushort *)data_out = (npy_ushort)(
                                         (unsigned int)(*(npy_ushort *)data0) *
                                         (*(npy_ushort *)data1) *
                                         (*(npy_ushort *)data2) +
                                         (*(npy_ushort *)data_out));
        data0 += stride0;
        data1 += stride1;
        data2 += stride2;
        data_out += stride_out;
#  else
        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
        int i;
        for (i = 1; i < nop; ++i) {
            temp = (npy_ushort)((unsigned int)temp *
                                (*(npy_ushort *)dataptr[i]));
        }
        /* For nop >= 1 the loop leaves i == nop, i.e. the output operand. */
        *(npy_ushort *)dataptr[nop] = (temp +
                                           (*(npy_ushort *)dataptr[i]));
        for (i = 0; i <= nop; ++i) {
            dataptr[i] += strides[i];
        }
#  endif
#else /* complex */
#  if 3 == 1
        ((npy_ushort *)data_out)[0] = ((npy_ushort *)data0)[0] +
                                         ((npy_ushort *)data_out)[0];
        ((npy_ushort *)data_out)[1] = ((npy_ushort *)data0)[1] +
                                         ((npy_ushort *)data_out)[1];
        data0 += stride0;
        data_out += stride_out;
#  else
#    if 3 <= 3
#define _SUMPROD_NOP 3
#    else
#define _SUMPROD_NOP nop
#    endif
        npy_ushort re, im, tmp;
        int i;
        re = ((npy_ushort *)dataptr[0])[0];
        im = ((npy_ushort *)dataptr[0])[1];
        for (i = 1; i < _SUMPROD_NOP; ++i) {
            tmp = re * ((npy_ushort *)dataptr[i])[0] -
                  im * ((npy_ushort *)dataptr[i])[1];
            im = re * ((npy_ushort *)dataptr[i])[1] +
                 im * ((npy_ushort *)dataptr[i])[0];
            re = tmp;
        }
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];

        for (i = 0; i <= _SUMPROD_NOP; ++i) {
            dataptr[i] += strides[i];
        }
#undef _SUMPROD_NOP
#  endif
#endif
    }
}

#if 3 == 1

static void
ushort_sum_of_products_contig_one(int nop, char **dataptr,
                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
{
    npy_ushort *data0 = (npy_ushort *)dataptr[0];
    npy_ushort *data_out = (npy_ushort *)dataptr[1];

    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
                                                            (int)count);

/* This is placed before the main loop to make small counts faster */
finish_after_unrolled_loop:
    switch (count) {
#line 246
        case 6+1:
#if !0
            data_out[6] = ((data0[6]) +
                                 (data_out[6]));
#else
            ((npy_ushort *)data_out + 2*6)[0] =
                                    ((npy_ushort *)data0 + 2*6)[0] +
                                    ((npy_ushort *)data_out + 2*6)[0];
            ((npy_ushort *)data_out + 2*6)[1] =
                                    ((npy_ushort *)data0 + 2*6)[1] +
                                    ((npy_ushort *)data_out + 2*6)[1];
#endif

#line 246
        case 5+1:
#if !0
            data_out[5] = ((data0[5]) +
                                 (data_out[5]));
#else
            ((npy_ushort *)data_out + 2*5)[0] =
                                    ((npy_ushort *)data0 + 2*5)[0] +
                                    ((npy_ushort *)data_out + 2*5)[0];
            ((npy_ushort *)data_out + 2*5)[1] =
                                    ((npy_ushort *)data0 + 2*5)[1] +
                                    ((npy_ushort *)data_out + 2*5)[1];
#endif

#line 246
        case 4+1:
#if !0
            data_out[4] = ((data0[4]) +
                                 (data_out[4]));
#else
            ((npy_ushort *)data_out + 2*4)[0] =
                                    ((npy_ushort *)data0 + 2*4)[0] +
                                    ((npy_ushort *)data_out + 2*4)[0];
            ((npy_ushort *)data_out + 2*4)[1] =
                                    ((npy_ushort *)data0 + 2*4)[1] +
                                    ((npy_ushort *)data_out + 2*4)[1];
#endif

#line 246
        case 3+1:
#if !0
            data_out[3] = ((data0[3]) +
                                 (data_out[3]));
#else
            ((npy_ushort *)data_out + 2*3)[0] =
                                    ((npy_ushort *)data0 + 2*3)[0] +
                                    ((npy_ushort *)data_out + 2*3)[0];
            ((npy_ushort *)data_out + 2*3)[1] =
                                    ((npy_ushort *)data0 + 2*3)[1] +
                                    ((npy_ushort *)data_out + 2*3)[1];
#endif

#line 246
        case 2+1:
#if !0
            data_out[2] = ((data0[2]) +
                                 (data_out[2]));
#else
            ((npy_ushort *)data_out + 2*2)[0] =
                                    ((npy_ushort *)data0 + 2*2)[0] +
                                    ((npy_ushort *)data_out + 2*2)[0];
            ((npy_ushort *)data_out + 2*2)[1] =
                                    ((npy_ushort *)data0 + 2*2)[1] +
                                    ((npy_ushort *)data_out + 2*2)[1];
#endif

#line 246
        case 1+1:
#if !0
            data_out[1] = ((data0[1]) +
                                 (data_out[1]));
#else
            ((npy_ushort *)data_out + 2*1)[0] =
                                    ((npy_ushort *)data0 + 2*1)[0] +
                                    ((npy_ushort *)data_out + 2*1)[0];
            ((npy_ushort *)data_out + 2*1)[1] =
                                    ((npy_ushort *)data0 + 2*1)[1] +
                                    ((npy_ushort *)data_out + 2*1)[1];
#endif

#line 246
        case 0+1:
#if !0
            data_out[0] = ((data0[0]) +
                                 (data_out[0]));
#else
            ((npy_ushort *)data_out + 2*0)[0] =
                                    ((npy_ushort *)data0 + 2*0)[0] +
                                    ((npy_ushort *)data_out + 2*0)[0];
            ((npy_ushort *)data_out + 2*0)[1] =
                                    ((npy_ushort *)data0 + 2*0)[1] +
                                    ((npy_ushort *)data_out + 2*0)[1];
#endif

        case 0:
            return;
    }

    /* Unroll the loop by 8 */
    while (count >= 8) {
        count -= 8;

#line 270
#if !0
        data_out[0] = ((data0[0]) +
                             (data_out[0]));
#else /* complex */
        ((npy_ushort *)data_out + 2*0)[0] =
                                ((npy_ushort *)data0 + 2*0)[0] +
                                ((npy_ushort *)data_out + 2*0)[0];
        ((npy_ushort *)data_out + 2*0)[1] =
                                ((npy_ushort *)data0 + 2*0)[1] +
                                ((npy_ushort *)data_out + 2*0)[1];
#endif

#line 270
#if !0
        data_out[1] = ((data0[1]) +
                             (data_out[1]));
#else /* complex */
        ((npy_ushort *)data_out + 2*1)[0] =
                                ((npy_ushort *)data0 + 2*1)[0] +
                                ((npy_ushort *)data_out + 2*1)[0];
        ((npy_ushort *)data_out + 2*1)[1] =
                                ((npy_ushort *)data0 + 2*1)[1] +
                                ((npy_ushort *)data_out + 2*1)[1];
#endif

#line 270
#if !0
        data_out[2] = ((data0[2]) +
                             (data_out[2]));
#else /* complex */
        ((npy_ushort *)data_out + 2*2)[0] =
                                ((npy_ushort *)data0 + 2*2)[0] +
                                ((npy_ushort *)data_out + 2*2)[0];
        ((npy_ushort *)data_out + 2*2)[1] =
                                ((npy_ushort *)data0 + 2*2)[1] +
                                ((npy_ushort *)data_out + 2*2)[1];
#endif

#line 270
#if !0
        data_out[3] = ((data0[3]) +
                             (data_out[3]));
#else /* complex */
        ((npy_ushort *)data_out + 2*3)[0] =
                                ((npy_ushort *)data0 + 2*3)[0] +
                                ((npy_ushort *)data_out + 2*3)[0];
        ((npy_ushort *)data_out + 2*3)[1] =
                                ((npy_ushort *)data0 + 2*3)[1] +
                                ((npy_ushort *)data_out + 2*3)[1];
#endif

#line 270
#if !0
        data_out[4] = ((data0[4]) +
                             (data_out[4]));
#else /* complex */
        ((npy_ushort *)data_out + 2*4)[0] =
                                ((npy_ushort *)data0 + 2*4)[0] +
                                ((npy_ushort *)data_out + 2*4)[0];
        ((npy_ushort *)data_out + 2*4)[1] =
                                ((npy_ushort *)data0 + 2*4)[1] +
                                ((npy_ushort *)data_out + 2*4)[1];
#endif

#line 270
#if !0
        data_out[5] = ((data0[5]) +
                             (data_out[5]));
#else /* complex */
        ((npy_ushort *)data_out + 2*5)[0] =
                                ((npy_ushort *)data0 + 2*5)[0] +
                                ((npy_ushort *)data_out + 2*5)[0];
        ((npy_ushort *)data_out + 2*5)[1] =
                                ((npy_ushort *)data0 + 2*5)[1] +
                                ((npy_ushort *)data_out + 2*5)[1];
#endif

#line 270
#if !0
        data_out[6] = ((data0[6]) +
                             (data_out[6]));
#else /* complex */
        ((npy_ushort *)data_out + 2*6)[0] =
                                ((npy_ushort *)data0 + 2*6)[0] +
                                ((npy_ushort *)data_out + 2*6)[0];
        ((npy_ushort *)data_out + 2*6)[1] =
                                ((npy_ushort *)data0 + 2*6)[1] +
                                ((npy_ushort *)data_out + 2*6)[1];
#endif

#line 270
#if !0
        data_out[7] = ((data0[7]) +
                             (data_out[7]));
#else /* complex */
        ((npy_ushort *)data_out + 2*7)[0] =
                                ((npy_ushort *)data0 + 2*7)[0] +
                                ((npy_ushort *)data_out + 2*7)[0];
        ((npy_ushort *)data_out + 2*7)[1] =
                                ((npy_ushort *)data0 + 2*7)[1] +
                                ((npy_ushort *)data_out + 2*7)[1];
#endif

        data0 += 8;
        data_out += 8;
    }

    /* Finish off the loop */
    goto finish_after_unrolled_loop;
}

#elif 3 == 2 && !0

// Computes the multiply-add operation: data_out = data * scalar + data_out
static NPY_GCC_OPT_3 void
ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
{
#if 0 // NPYV check for npy_ushort
    /* Use aligned instructions if possible */
    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
    const int vstep = npyv_nlanes_u16;
    const npyv_u16 v_scalar = npyv_setall_u16(scalar);
    #line 306
    if(is_aligned) {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u16 b0 = npyv_loada_u16(data + vstep * 0);
            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);
            
#line 312
            npyv_u16 b1 = npyv_loada_u16(data + vstep * 1);
            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);
            
#line 312
            npyv_u16 b2 = npyv_loada_u16(data + vstep * 2);
            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);
            
#line 312
            npyv_u16 b3 = npyv_loada_u16(data + vstep * 3);
            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);
            
            #line 318
            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
            
#line 318
            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
            
#line 318
            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
            
#line 318
            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
            
            #line 323
            npyv_storea_u16(data_out + vstep * 0, abc0);
            
#line 323
            npyv_storea_u16(data_out + vstep * 1, abc1);
            
#line 323
            npyv_storea_u16(data_out + vstep * 2, abc2);
            
#line 323
            npyv_storea_u16(data_out + vstep * 3, abc3);
            
        }
    }
    
#line 306
    else {
        const npy_intp vstepx4 = vstep * 4;
        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
            #line 312
            npyv_u16 b0 = npyv_load_u16(data + vstep * 0);
            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);
            
#line 312
            npyv_u16 b1 = npyv_load_u16(data + vstep * 1);
            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);
            
#line 312
            npyv_u16 b2 = npyv_load_u16(data + vstep * 2);
            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);
            
#line 312
            npyv_u16 b3 = npyv_load_u16(data + vstep * 3);
            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);
            
            #line 318
            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
            
#line 318
            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
            
#line 318
            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
            
#line 318
            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
            
            #line 323
            npyv_store_u16(data_out + vstep * 0, abc0);
            
#line 323
            npyv_store_u16(data_