#include #include typedef double v4d __attribute__ ((vector_size (32))); /* assumptions: p divisible by 4, b and c are 32-byte-aligned */ void matmul(double a[], v4d b[], v4d c[], size_t m, size_t n, size_t p) { size_t i,j,k; double r; p=p/4; memset(c,0,n*p*sizeof(double)); for (i=0; i