#include <stdlib.h>
#include <string.h>

/* Four packed doubles (32 bytes), using the GCC/Clang vector_size extension. */
typedef double v4d __attribute__ ((vector_size (32)));

/*
 * Dense matrix product: c = a * b.
 *
 *   a  n-by-m matrix of doubles, row-major (element (i,k) is a[i*m + k]).
 *   b  m-by-p matrix, row-major, stored as p/4 v4d vectors per row.
 *   c  n-by-p result, row-major, stored as p/4 v4d vectors per row.
 *      Fully overwritten; its previous contents are ignored.
 *   m  number of columns of a / rows of b.
 *   n  number of rows of a and c.
 *   p  number of columns of b and c, counted in doubles.
 *
 * Assumptions: p is divisible by 4, and b and c are 32-byte-aligned.
 */
void matmul(double a[], v4d b[], v4d c[], size_t m, size_t n, size_t p)
{
  /* From here on rows of b and c are addressed in whole vectors. */
  const size_t vecs_per_row = p / 4;

  /* Clear the whole result.  The element size must be that of a v4d
     (sizeof *c), not of a double: sizing by double would clear only the
     first quarter of c and leave the rest accumulating onto stale data. */
  memset(c, 0, n * vecs_per_row * sizeof *c);

  /* i-k-j order: the inner loop walks one row of b and one row of c
     contiguously, scaling the b row by a single scalar a(i,k). */
  for (size_t i = 0; i < n; i++) {
    for (size_t k = 0; k < m; k++) {
      const double aik = a[i*m + k];
      for (size_t j = 0; j < vecs_per_row; j++) {
        c[i*vecs_per_row + j] += aik * b[k*vecs_per_row + j];
      }
    }
  }
}