/*
**
** PHiPAC Matrix-Matrix Code for the operation:
**    C = alpha*transpose(A)*transpose(B) + C
**
** Automatically Generated by mm_cgen ($Revision: 1.27 $) using the command:
**    ./mm_cgen -prec double -opA T -opB T -alpha c -sp 1 -holdstripe B -l0 1 20 12 -file ./src/mm_double_TT_c_general.c -routine_name mm_double_TT_c_general 
**
** Run './mm_cgen -help' for help.
**
** Generated on: Wednesday July 10 2013, 08:33:28 PDT
** Created by: Jeff Bilmes <bilmes@cs.berkeley.edu>
**             http://www.icsi.berkeley.edu/~bilmes/phipac
**
**
** Routine Usage: General (M,K,N) = (M, K, N) matrix multiply
**    mm_double_TT_c_general(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride, const double alpha)
** where
**  transpose(A) is an MxK matrix
**  transpose(B) is a KxN matrix
**  C is an MxN matrix
**  Astride is the number of entries between the start of each row of A
**  Bstride is the number of entries between the start of each row of B
**  Cstride is the number of entries between the start of each row of C
**
**
** "Copyright (c) 1995 The Regents of the University of California.  All
** rights reserved."  Permission to use, copy, modify, and distribute
** this software and its documentation for any purpose, without fee, and
** without written agreement is hereby granted, provided that the above
** copyright notice and the following two paragraphs appear in all copies
** of this software.
**
** IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
** DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
** OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
** CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**
** THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
** INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
** AND FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
** ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
** PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
**
*/

/*
 * General (M,K,N) = (M, K, N) matrix multiply
 */
void
mm_double_TT_c_general(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride, const double alpha)
{
   const double *a,*b;
   double *c;
   const double *ap;
   const double *bp_0,*bp_1,*bp_2,*bp_3,*bp_4,*bp_5,*bp_6,*bp_7,*bp_8,*bp_9,*bp_10,*bp_11;
   double *cp;
   const int B_sbs_stride = Bstride*12;
   const int C_sbs_stride = Cstride*1;
   const int k_marg_el = K % 20;
   const int k_norm = (K - k_marg_el)*Astride;
   const int m_marg_el = M & 0;
   const int m_norm = M - m_marg_el;
   const int n_marg_el = N % 12;
   const int n_norm = N - n_marg_el;
   double *const c_endp = C+m_norm*Cstride;
   register double c0_0,c0_1,c0_2,c0_3,c0_4,c0_5,c0_6,c0_7,c0_8,c0_9,c0_10,c0_11;
   for (c=C,a=A; c!= c_endp; c+=C_sbs_stride,a+=1) {
      const double* const ap_endp = a + k_norm;
      double* const cp_endp = c + n_norm;
      for (b=B,cp=c; cp!=cp_endp; b+=B_sbs_stride,cp+=12) {
         register double _b0,_b1,_b2,_b3,_b4,_b5,_b6,_b7,_b8,_b9,_b10,_b11;
         register double _a0;
         double *_cp;
         ap=a;
         bp_0 = b;
         bp_1 = bp_0 + Bstride;
         bp_2 = bp_1 + Bstride;
         bp_3 = bp_2 + Bstride;
         bp_4 = bp_3 + Bstride;
         bp_5 = bp_4 + Bstride;
         bp_6 = bp_5 + Bstride;
         bp_7 = bp_6 + Bstride;
         bp_8 = bp_7 + Bstride;
         bp_9 = bp_8 + Bstride;
         bp_10 = bp_9 + Bstride;
         bp_11 = bp_10 + Bstride;
         c0_0 = 0.0; c0_1 = 0.0; c0_2 = 0.0; c0_3 = 0.0; c0_4 = 0.0; c0_5 = 0.0; c0_6 = 0.0; c0_7 = 0.0; c0_8 = 0.0; c0_9 = 0.0; c0_10 = 0.0; c0_11 = 0.0; 
         for (;ap!=ap_endp; bp_0+=20,bp_1+=20,bp_2+=20,bp_3+=20,bp_4+=20,bp_5+=20,bp_6+=20,bp_7+=20,bp_8+=20,bp_9+=20,bp_10+=20,bp_11+=20) {
            /* Fixed M,K,N = 1,20,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; _b8 = bp_8[1]; _b9 = bp_9[1]; _b10 = bp_10[1]; _b11 = bp_11[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; _b8 = bp_8[2]; _b9 = bp_9[2]; _b10 = bp_10[2]; _b11 = bp_11[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; _b8 = bp_8[3]; _b9 = bp_9[3]; _b10 = bp_10[3]; _b11 = bp_11[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; _b8 = bp_8[4]; _b9 = bp_9[4]; _b10 = bp_10[4]; _b11 = bp_11[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; _b8 = bp_8[5]; _b9 = bp_9[5]; _b10 = bp_10[5]; _b11 = bp_11[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; _b8 = bp_8[6]; _b9 = bp_9[6]; _b10 = bp_10[6]; _b11 = bp_11[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; _b8 = bp_8[7]; _b9 = bp_9[7]; _b10 = bp_10[7]; _b11 = bp_11[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; _b4 = bp_4[8]; _b5 = bp_5[8]; _b6 = bp_6[8]; _b7 = bp_7[8]; _b8 = bp_8[8]; _b9 = bp_9[8]; _b10 = bp_10[8]; _b11 = bp_11[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; _b4 = bp_4[9]; _b5 = bp_5[9]; _b6 = bp_6[9]; _b7 = bp_7[9]; _b8 = bp_8[9]; _b9 = bp_9[9]; _b10 = bp_10[9]; _b11 = bp_11[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; _b4 = bp_4[10]; _b5 = bp_5[10]; _b6 = bp_6[10]; _b7 = bp_7[10]; _b8 = bp_8[10]; _b9 = bp_9[10]; _b10 = bp_10[10]; _b11 = bp_11[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; _b4 = bp_4[11]; _b5 = bp_5[11]; _b6 = bp_6[11]; _b7 = bp_7[11]; _b8 = bp_8[11]; _b9 = bp_9[11]; _b10 = bp_10[11]; _b11 = bp_11[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; _b4 = bp_4[12]; _b5 = bp_5[12]; _b6 = bp_6[12]; _b7 = bp_7[12]; _b8 = bp_8[12]; _b9 = bp_9[12]; _b10 = bp_10[12]; _b11 = bp_11[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; _b4 = bp_4[13]; _b5 = bp_5[13]; _b6 = bp_6[13]; _b7 = bp_7[13]; _b8 = bp_8[13]; _b9 = bp_9[13]; _b10 = bp_10[13]; _b11 = bp_11[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; _b4 = bp_4[14]; _b5 = bp_5[14]; _b6 = bp_6[14]; _b7 = bp_7[14]; _b8 = bp_8[14]; _b9 = bp_9[14]; _b10 = bp_10[14]; _b11 = bp_11[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; _b4 = bp_4[15]; _b5 = bp_5[15]; _b6 = bp_6[15]; _b7 = bp_7[15]; _b8 = bp_8[15]; _b9 = bp_9[15]; _b10 = bp_10[15]; _b11 = bp_11[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[16]; _b1 = bp_1[16]; _b2 = bp_2[16]; _b3 = bp_3[16]; _b4 = bp_4[16]; _b5 = bp_5[16]; _b6 = bp_6[16]; _b7 = bp_7[16]; _b8 = bp_8[16]; _b9 = bp_9[16]; _b10 = bp_10[16]; _b11 = bp_11[16]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[17]; _b1 = bp_1[17]; _b2 = bp_2[17]; _b3 = bp_3[17]; _b4 = bp_4[17]; _b5 = bp_5[17]; _b6 = bp_6[17]; _b7 = bp_7[17]; _b8 = bp_8[17]; _b9 = bp_9[17]; _b10 = bp_10[17]; _b11 = bp_11[17]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[18]; _b1 = bp_1[18]; _b2 = bp_2[18]; _b3 = bp_3[18]; _b4 = bp_4[18]; _b5 = bp_5[18]; _b6 = bp_6[18]; _b7 = bp_7[18]; _b8 = bp_8[18]; _b9 = bp_9[18]; _b10 = bp_10[18]; _b11 = bp_11[18]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[19]; _b1 = bp_1[19]; _b2 = bp_2[19]; _b3 = bp_3[19]; _b4 = bp_4[19]; _b5 = bp_5[19]; _b6 = bp_6[19]; _b7 = bp_7[19]; _b8 = bp_8[19]; _b9 = bp_9[19]; _b10 = bp_10[19]; _b11 = bp_11[19]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; _b8 = bp_8[1]; _b9 = bp_9[1]; _b10 = bp_10[1]; _b11 = bp_11[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; _b8 = bp_8[2]; _b9 = bp_9[2]; _b10 = bp_10[2]; _b11 = bp_11[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; _b8 = bp_8[3]; _b9 = bp_9[3]; _b10 = bp_10[3]; _b11 = bp_11[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; _b8 = bp_8[4]; _b9 = bp_9[4]; _b10 = bp_10[4]; _b11 = bp_11[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; _b8 = bp_8[5]; _b9 = bp_9[5]; _b10 = bp_10[5]; _b11 = bp_11[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; _b8 = bp_8[6]; _b9 = bp_9[6]; _b10 = bp_10[6]; _b11 = bp_11[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; _b8 = bp_8[7]; _b9 = bp_9[7]; _b10 = bp_10[7]; _b11 = bp_11[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; _b4 = bp_4[8]; _b5 = bp_5[8]; _b6 = bp_6[8]; _b7 = bp_7[8]; _b8 = bp_8[8]; _b9 = bp_9[8]; _b10 = bp_10[8]; _b11 = bp_11[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; _b4 = bp_4[9]; _b5 = bp_5[9]; _b6 = bp_6[9]; _b7 = bp_7[9]; _b8 = bp_8[9]; _b9 = bp_9[9]; _b10 = bp_10[9]; _b11 = bp_11[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; _b4 = bp_4[10]; _b5 = bp_5[10]; _b6 = bp_6[10]; _b7 = bp_7[10]; _b8 = bp_8[10]; _b9 = bp_9[10]; _b10 = bp_10[10]; _b11 = bp_11[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; _b4 = bp_4[11]; _b5 = bp_5[11]; _b6 = bp_6[11]; _b7 = bp_7[11]; _b8 = bp_8[11]; _b9 = bp_9[11]; _b10 = bp_10[11]; _b11 = bp_11[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; _b4 = bp_4[12]; _b5 = bp_5[12]; _b6 = bp_6[12]; _b7 = bp_7[12]; _b8 = bp_8[12]; _b9 = bp_9[12]; _b10 = bp_10[12]; _b11 = bp_11[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; _b4 = bp_4[13]; _b5 = bp_5[13]; _b6 = bp_6[13]; _b7 = bp_7[13]; _b8 = bp_8[13]; _b9 = bp_9[13]; _b10 = bp_10[13]; _b11 = bp_11[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; _b4 = bp_4[14]; _b5 = bp_5[14]; _b6 = bp_6[14]; _b7 = bp_7[14]; _b8 = bp_8[14]; _b9 = bp_9[14]; _b10 = bp_10[14]; _b11 = bp_11[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; _b4 = bp_4[15]; _b5 = bp_5[15]; _b6 = bp_6[15]; _b7 = bp_7[15]; _b8 = bp_8[15]; _b9 = bp_9[15]; _b10 = bp_10[15]; _b11 = bp_11[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

            bp_0+=16;bp_1+=16;bp_2+=16;bp_3+=16;bp_4+=16;bp_5+=16;bp_6+=16;bp_7+=16;bp_8+=16;bp_9+=16;bp_10+=16;bp_11+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; _b8 = bp_8[1]; _b9 = bp_9[1]; _b10 = bp_10[1]; _b11 = bp_11[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; _b8 = bp_8[2]; _b9 = bp_9[2]; _b10 = bp_10[2]; _b11 = bp_11[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; _b8 = bp_8[3]; _b9 = bp_9[3]; _b10 = bp_10[3]; _b11 = bp_11[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; _b8 = bp_8[4]; _b9 = bp_9[4]; _b10 = bp_10[4]; _b11 = bp_11[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; _b8 = bp_8[5]; _b9 = bp_9[5]; _b10 = bp_10[5]; _b11 = bp_11[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; _b8 = bp_8[6]; _b9 = bp_9[6]; _b10 = bp_10[6]; _b11 = bp_11[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; _b8 = bp_8[7]; _b9 = bp_9[7]; _b10 = bp_10[7]; _b11 = bp_11[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

            bp_0+=8;bp_1+=8;bp_2+=8;bp_3+=8;bp_4+=8;bp_5+=8;bp_6+=8;bp_7+=8;bp_8+=8;bp_9+=8;bp_10+=8;bp_11+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; _b8 = bp_8[1]; _b9 = bp_9[1]; _b10 = bp_10[1]; _b11 = bp_11[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; _b8 = bp_8[2]; _b9 = bp_9[2]; _b10 = bp_10[2]; _b11 = bp_11[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; _b8 = bp_8[3]; _b9 = bp_9[3]; _b10 = bp_10[3]; _b11 = bp_11[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

            bp_0+=4;bp_1+=4;bp_2+=4;bp_3+=4;bp_4+=4;bp_5+=4;bp_6+=4;bp_7+=4;bp_8+=4;bp_9+=4;bp_10+=4;bp_11+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; _b8 = bp_8[1]; _b9 = bp_9[1]; _b10 = bp_10[1]; _b11 = bp_11[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

            bp_0+=2;bp_1+=2;bp_2+=2;bp_3+=2;bp_4+=2;bp_5+=2;bp_6+=2;bp_7+=2;bp_8+=2;bp_9+=2;bp_10+=2;bp_11+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,12 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; _b8 = bp_8[0]; _b9 = bp_9[0]; _b10 = bp_10[0]; _b11 = bp_11[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; c0_8 += _b8*_a0; c0_9 += _b9*_a0; c0_10 += _b10*_a0; c0_11 += _b11*_a0; 
            ap += Astride;

         }
         _cp=cp;_cp[0]+=alpha*c0_0;_cp[1]+=alpha*c0_1;_cp[2]+=alpha*c0_2;_cp[3]+=alpha*c0_3;_cp[4]+=alpha*c0_4;_cp[5]+=alpha*c0_5;_cp[6]+=alpha*c0_6;_cp[7]+=alpha*c0_7;_cp[8]+=alpha*c0_8;_cp[9]+=alpha*c0_9;_cp[10]+=alpha*c0_10;_cp[11]+=alpha*c0_11;
      }
   }
   for (c=C,a=A; c!= c_endp; c+=C_sbs_stride,a+=1) {
      const double* const ap_endp = a + k_norm;
      b = B+n_norm*Bstride;
      cp = c+n_norm;
      if (n_marg_el & 0x8) {
         register double _b0,_b1,_b2,_b3,_b4,_b5,_b6,_b7;
         register double _a0;
         double *_cp;
         ap=a;
         bp_0 = b;
         bp_1 = bp_0 + Bstride;
         bp_2 = bp_1 + Bstride;
         bp_3 = bp_2 + Bstride;
         bp_4 = bp_3 + Bstride;
         bp_5 = bp_4 + Bstride;
         bp_6 = bp_5 + Bstride;
         bp_7 = bp_6 + Bstride;
         c0_0 = 0.0; c0_1 = 0.0; c0_2 = 0.0; c0_3 = 0.0; c0_4 = 0.0; c0_5 = 0.0; c0_6 = 0.0; c0_7 = 0.0; 
         for (;ap!=ap_endp; bp_0+=20,bp_1+=20,bp_2+=20,bp_3+=20,bp_4+=20,bp_5+=20,bp_6+=20,bp_7+=20) {
            /* Fixed M,K,N = 1,20,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; _b4 = bp_4[8]; _b5 = bp_5[8]; _b6 = bp_6[8]; _b7 = bp_7[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; _b4 = bp_4[9]; _b5 = bp_5[9]; _b6 = bp_6[9]; _b7 = bp_7[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; _b4 = bp_4[10]; _b5 = bp_5[10]; _b6 = bp_6[10]; _b7 = bp_7[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; _b4 = bp_4[11]; _b5 = bp_5[11]; _b6 = bp_6[11]; _b7 = bp_7[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; _b4 = bp_4[12]; _b5 = bp_5[12]; _b6 = bp_6[12]; _b7 = bp_7[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; _b4 = bp_4[13]; _b5 = bp_5[13]; _b6 = bp_6[13]; _b7 = bp_7[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; _b4 = bp_4[14]; _b5 = bp_5[14]; _b6 = bp_6[14]; _b7 = bp_7[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; _b4 = bp_4[15]; _b5 = bp_5[15]; _b6 = bp_6[15]; _b7 = bp_7[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[16]; _b1 = bp_1[16]; _b2 = bp_2[16]; _b3 = bp_3[16]; _b4 = bp_4[16]; _b5 = bp_5[16]; _b6 = bp_6[16]; _b7 = bp_7[16]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[17]; _b1 = bp_1[17]; _b2 = bp_2[17]; _b3 = bp_3[17]; _b4 = bp_4[17]; _b5 = bp_5[17]; _b6 = bp_6[17]; _b7 = bp_7[17]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[18]; _b1 = bp_1[18]; _b2 = bp_2[18]; _b3 = bp_3[18]; _b4 = bp_4[18]; _b5 = bp_5[18]; _b6 = bp_6[18]; _b7 = bp_7[18]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[19]; _b1 = bp_1[19]; _b2 = bp_2[19]; _b3 = bp_3[19]; _b4 = bp_4[19]; _b5 = bp_5[19]; _b6 = bp_6[19]; _b7 = bp_7[19]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; _b4 = bp_4[8]; _b5 = bp_5[8]; _b6 = bp_6[8]; _b7 = bp_7[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; _b4 = bp_4[9]; _b5 = bp_5[9]; _b6 = bp_6[9]; _b7 = bp_7[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; _b4 = bp_4[10]; _b5 = bp_5[10]; _b6 = bp_6[10]; _b7 = bp_7[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; _b4 = bp_4[11]; _b5 = bp_5[11]; _b6 = bp_6[11]; _b7 = bp_7[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; _b4 = bp_4[12]; _b5 = bp_5[12]; _b6 = bp_6[12]; _b7 = bp_7[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; _b4 = bp_4[13]; _b5 = bp_5[13]; _b6 = bp_6[13]; _b7 = bp_7[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; _b4 = bp_4[14]; _b5 = bp_5[14]; _b6 = bp_6[14]; _b7 = bp_7[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; _b4 = bp_4[15]; _b5 = bp_5[15]; _b6 = bp_6[15]; _b7 = bp_7[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

            bp_0+=16;bp_1+=16;bp_2+=16;bp_3+=16;bp_4+=16;bp_5+=16;bp_6+=16;bp_7+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; _b4 = bp_4[4]; _b5 = bp_5[4]; _b6 = bp_6[4]; _b7 = bp_7[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; _b4 = bp_4[5]; _b5 = bp_5[5]; _b6 = bp_6[5]; _b7 = bp_7[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; _b4 = bp_4[6]; _b5 = bp_5[6]; _b6 = bp_6[6]; _b7 = bp_7[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; _b4 = bp_4[7]; _b5 = bp_5[7]; _b6 = bp_6[7]; _b7 = bp_7[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

            bp_0+=8;bp_1+=8;bp_2+=8;bp_3+=8;bp_4+=8;bp_5+=8;bp_6+=8;bp_7+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; _b4 = bp_4[2]; _b5 = bp_5[2]; _b6 = bp_6[2]; _b7 = bp_7[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; _b4 = bp_4[3]; _b5 = bp_5[3]; _b6 = bp_6[3]; _b7 = bp_7[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

            bp_0+=4;bp_1+=4;bp_2+=4;bp_3+=4;bp_4+=4;bp_5+=4;bp_6+=4;bp_7+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; _b4 = bp_4[1]; _b5 = bp_5[1]; _b6 = bp_6[1]; _b7 = bp_7[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

            bp_0+=2;bp_1+=2;bp_2+=2;bp_3+=2;bp_4+=2;bp_5+=2;bp_6+=2;bp_7+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,8 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; _b4 = bp_4[0]; _b5 = bp_5[0]; _b6 = bp_6[0]; _b7 = bp_7[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; c0_4 += _b4*_a0; c0_5 += _b5*_a0; c0_6 += _b6*_a0; c0_7 += _b7*_a0; 
            ap += Astride;

         }
         _cp=cp;_cp[0]+=alpha*c0_0;_cp[1]+=alpha*c0_1;_cp[2]+=alpha*c0_2;_cp[3]+=alpha*c0_3;_cp[4]+=alpha*c0_4;_cp[5]+=alpha*c0_5;_cp[6]+=alpha*c0_6;_cp[7]+=alpha*c0_7;
         b+=Bstride*8;
         cp+=8;
      }
      if (n_marg_el & 0x4) {
         register double _b0,_b1,_b2,_b3;
         register double _a0;
         double *_cp;
         ap=a;
         bp_0 = b;
         bp_1 = bp_0 + Bstride;
         bp_2 = bp_1 + Bstride;
         bp_3 = bp_2 + Bstride;
         c0_0 = 0.0; c0_1 = 0.0; c0_2 = 0.0; c0_3 = 0.0; 
         for (;ap!=ap_endp; bp_0+=20,bp_1+=20,bp_2+=20,bp_3+=20) {
            /* Fixed M,K,N = 1,20,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[16]; _b1 = bp_1[16]; _b2 = bp_2[16]; _b3 = bp_3[16]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[17]; _b1 = bp_1[17]; _b2 = bp_2[17]; _b3 = bp_3[17]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[18]; _b1 = bp_1[18]; _b2 = bp_2[18]; _b3 = bp_3[18]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[19]; _b1 = bp_1[19]; _b2 = bp_2[19]; _b3 = bp_3[19]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; _b2 = bp_2[8]; _b3 = bp_3[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; _b2 = bp_2[9]; _b3 = bp_3[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; _b2 = bp_2[10]; _b3 = bp_3[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; _b2 = bp_2[11]; _b3 = bp_3[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; _b2 = bp_2[12]; _b3 = bp_3[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; _b2 = bp_2[13]; _b3 = bp_3[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; _b2 = bp_2[14]; _b3 = bp_3[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; _b2 = bp_2[15]; _b3 = bp_3[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

            bp_0+=16;bp_1+=16;bp_2+=16;bp_3+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; _b2 = bp_2[4]; _b3 = bp_3[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; _b2 = bp_2[5]; _b3 = bp_3[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; _b2 = bp_2[6]; _b3 = bp_3[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; _b2 = bp_2[7]; _b3 = bp_3[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

            bp_0+=8;bp_1+=8;bp_2+=8;bp_3+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; _b2 = bp_2[2]; _b3 = bp_3[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; _b2 = bp_2[3]; _b3 = bp_3[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

            bp_0+=4;bp_1+=4;bp_2+=4;bp_3+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; _b2 = bp_2[1]; _b3 = bp_3[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

            bp_0+=2;bp_1+=2;bp_2+=2;bp_3+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,4 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; _b2 = bp_2[0]; _b3 = bp_3[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; c0_2 += _b2*_a0; c0_3 += _b3*_a0; 
            ap += Astride;

         }
         _cp=cp;_cp[0]+=alpha*c0_0;_cp[1]+=alpha*c0_1;_cp[2]+=alpha*c0_2;_cp[3]+=alpha*c0_3;
         b+=Bstride*4;
         cp+=4;
      }
      if (n_marg_el & 0x2) {
         register double _b0,_b1;
         register double _a0;
         double *_cp;
         ap=a;
         bp_0 = b;
         bp_1 = bp_0 + Bstride;
         c0_0 = 0.0; c0_1 = 0.0; 
         for (;ap!=ap_endp; bp_0+=20,bp_1+=20) {
            /* Fixed M,K,N = 1,20,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[16]; _b1 = bp_1[16]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[17]; _b1 = bp_1[17]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[18]; _b1 = bp_1[18]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[19]; _b1 = bp_1[19]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; _b1 = bp_1[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; _b1 = bp_1[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; _b1 = bp_1[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; _b1 = bp_1[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; _b1 = bp_1[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; _b1 = bp_1[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; _b1 = bp_1[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; _b1 = bp_1[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

            bp_0+=16;bp_1+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; _b1 = bp_1[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; _b1 = bp_1[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; _b1 = bp_1[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; _b1 = bp_1[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

            bp_0+=8;bp_1+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; _b1 = bp_1[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; _b1 = bp_1[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

            bp_0+=4;bp_1+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; _b1 = bp_1[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

            bp_0+=2;bp_1+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,2 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; _b1 = bp_1[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; c0_1 += _b1*_a0; 
            ap += Astride;

         }
         _cp=cp;_cp[0]+=alpha*c0_0;_cp[1]+=alpha*c0_1;
         b+=Bstride*2;
         cp+=2;
      }
      if (n_marg_el & 0x1) {
         register double _b0;
         register double _a0;
         double *_cp;
         ap=a;
         bp_0 = b;
         c0_0 = 0.0; 
         for (;ap!=ap_endp; bp_0+=20) {
            /* Fixed M,K,N = 1,20,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[16]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[17]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[18]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[19]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

         }
         if (k_marg_el & 0x10) {
            /* Fixed M,K,N = 1,16,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[8]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[9]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[10]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[11]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[12]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[13]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[14]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[15]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

            bp_0+=16;
         }
         if (k_marg_el & 0x8) {
            /* Fixed M,K,N = 1,8,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[4]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[5]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[6]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[7]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

            bp_0+=8;
         }
         if (k_marg_el & 0x4) {
            /* Fixed M,K,N = 1,4,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[2]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[3]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

            bp_0+=4;
         }
         if (k_marg_el & 0x2) {
            /* Fixed M,K,N = 1,2,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;
            
            _b0 = bp_0[1]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

            bp_0+=2;
         }
         if (k_marg_el & 0x1) {
            /* Fixed M,K,N = 1,1,1 fully-unrolled matrix matrix multiply. */
            
            _b0 = bp_0[0]; 
            _a0 = ap[0];
            c0_0 += _b0*_a0; 
            ap += Astride;

         }
         _cp=cp;_cp[0]+=alpha*c0_0;
      }
   }
}
