#include #include typedef double v4d __attribute__ ((vector_size (32))); /* assumptions: p divisible by 4, b and c are 32-byte-aligned */ void inner(double a[], v4d b[], v4d c[], size_t m, size_t n, size_t p) { size_t i,j,k; double r; p=p/4; i=0,k=0; double aik0 = a[i*m+k+0]; double aik1 = a[i*m+k+1]; double aik2 = a[i*m+k+2]; double aik3 = a[i*m+k+3]; for (j=0; j