#include #include typedef double v4d __attribute__ ((vector_size (32))); /* assumptions: p divisible by 4, b and c are 32-byte-aligned */ void inner(double a[], v4d b[], v4d c[], size_t m, size_t n, size_t p) { size_t i,j,k; double r; p=p/4; i=0,k=0; double ai0k0 = a[(i+0)*m+k+0]; double ai0k1 = a[(i+0)*m+k+1]; double ai0k2 = a[(i+0)*m+k+2]; double ai0k3 = a[(i+0)*m+k+3]; double ai1k0 = a[(i+1)*m+k+0]; double ai1k1 = a[(i+1)*m+k+1]; double ai1k2 = a[(i+1)*m+k+2]; double ai1k3 = a[(i+1)*m+k+3]; for (j=0; j