# Syntax: GNU as, AT&T operand order (op src, dst).
# Target: x86-64 with AVX2 (vpermpd) and FMA3 (vfmadd231pd).
# ABI:    System V AMD64.
        .file   "mmbi.c"
        .text
        .globl  matmulinner
        .type   matmulinner, @function

#-----------------------------------------------------------------------
# matmulinner -- 4x4 block of a times a 4-row panel of b, accumulated
#                into a 4-row panel of c:
#
#     for i in 0..3, for every column j:
#         c[i][j] += a[i][3]*b[3][j] + a[i][2]*b[2][j]
#                  + a[i][1]*b[1][j] + a[i][0]*b[0][j]
#
# Presumed C prototype (confirm against the caller in mmbi.c):
#     void matmulinner(const double *a, const double *b, double *c,
#                      long m, long p);
#
# In:   rdi = a   4 rows of 4 doubles are read; consecutive rows are
#                 m doubles (8*m bytes) apart.  No alignment needed.
#       rsi = b   4 rows; consecutive rows are 32*p bytes apart.
#       rdx = c   4 rows, same 32*p byte row pitch; read and written.
#       rcx = m   row pitch of a, in doubles.
#       r8  = p   number of 32-byte (4-double) column groups per row.
#
#       b and c are accessed with vmovapd, so every address touched
#       must be 32-byte aligned (the row pitch 32*p preserves the
#       alignment of the base pointers).
#
# Out:  nothing in registers; c is updated in place.
#       When p == 0 the function returns without touching memory.
#
# Clobbers: rdi, rsi, rdx, r8, r9, r10, r11, ymm0-ymm15, flags.
#           Only caller-saved registers are used (callee-saved rbx,
#           rbp, r12-r15 are left alone), so nothing is spilled.
#
# Register roles inside the loop:
#       ymm0-ymm3   a[0][0..3], each broadcast to all four lanes
#       ymm4        a[1][0..3] packed (lanes are splat on demand)
#       ymm5-ymm8   a[2][0..3], each broadcast to all four lanes
#       ymm9        a[3][0..3] packed (lanes are splat on demand)
#       ymm10-ymm13 current 4-double slice of b rows 0..3
#       ymm14       accumulator for the c slice being updated
#       ymm15       scratch for the splat of one lane of ymm4/ymm9
#       r9  = 1*32*p   byte offset of row 1 in b and c
#       r10 = 2*32*p   byte offset of row 2
#       r11 = 3*32*p   byte offset of row 3
#
# All sixteen ymm registers are live, which is why rows 1 and 3 of a
# are kept packed and re-splat with vpermpd every iteration instead of
# being pre-broadcast like rows 0 and 2.
#-----------------------------------------------------------------------
matmulinner:
.LFB2464:
        .cfi_startproc
        testq   %r8, %r8                        # p == 0: nothing to do,
        je      .L1                             # and no ymm state dirtied yet

        # ---- load the 4x4 block of a ----
        vbroadcastsd      (%rdi), %ymm0         # a[0][0]
        vbroadcastsd     8(%rdi), %ymm1         # a[0][1]
        vbroadcastsd    16(%rdi), %ymm2         # a[0][2]
        vbroadcastsd    24(%rdi), %ymm3         # a[0][3]
        leaq    (%rdi,%rcx,8), %rdi             # rdi -> a row 1
        vmovupd (%rdi), %ymm4                   # a[1][0..3]; unaligned load:
                                                # 8*m need not be a multiple of 32
        leaq    (%rdi,%rcx,8), %rdi             # rdi -> a row 2
        vbroadcastsd      (%rdi), %ymm5         # a[2][0]
        vbroadcastsd     8(%rdi), %ymm6         # a[2][1]
        vbroadcastsd    16(%rdi), %ymm7         # a[2][2]
        vbroadcastsd    24(%rdi), %ymm8         # a[2][3]
        vmovupd (%rdi,%rcx,8), %ymm9            # a[3][0..3]; unaligned for the
                                                # same reason as row 1

        # ---- row offsets shared by b and c ----
        movq    %r8, %r9
        salq    $5, %r9                         # r9  = 32*p
        leaq    (%r9,%r9), %r10                 # r10 = 2*32*p
        leaq    (%r9,%r9,2), %r11               # r11 = 3*32*p

        # ---- one iteration = one 4-double column slice; r8 = slices left (> 0) ----
.L3:
        vmovapd (%rsi), %ymm10                  # b[0][j..j+3]
        vmovapd (%rsi,%r9), %ymm11              # b[1][j..j+3]
        vmovapd (%rsi,%r10), %ymm12             # b[2][j..j+3]
        vmovapd (%rsi,%r11), %ymm13             # b[3][j..j+3]
        addq    $32, %rsi                       # next slice of b

        # c row 0: coefficients already broadcast in ymm0-ymm3
        vmovapd (%rdx), %ymm14
        vfmadd231pd     %ymm3, %ymm13, %ymm14   # += a[0][3] * b[3]
        vfmadd231pd     %ymm2, %ymm12, %ymm14   # += a[0][2] * b[2]
        vfmadd231pd     %ymm1, %ymm11, %ymm14   # += a[0][1] * b[1]
        vfmadd231pd     %ymm0, %ymm10, %ymm14   # += a[0][0] * b[0]
        vmovapd %ymm14, (%rdx)

        # c row 1: splat each lane of ymm4 into ymm15 just before use
        vmovapd (%rdx,%r9), %ymm14
        vpermpd $0xff, %ymm4, %ymm15            # a[1][3] in every lane
        vfmadd231pd     %ymm15, %ymm13, %ymm14
        vpermpd $0xaa, %ymm4, %ymm15            # a[1][2]
        vfmadd231pd     %ymm15, %ymm12, %ymm14
        vpermpd $0x55, %ymm4, %ymm15            # a[1][1]
        vfmadd231pd     %ymm15, %ymm11, %ymm14
        vpermpd $0x00, %ymm4, %ymm15            # a[1][0]
        vfmadd231pd     %ymm15, %ymm10, %ymm14
        vmovapd %ymm14, (%rdx,%r9)

        # c row 2: coefficients already broadcast in ymm5-ymm8
        vmovapd (%rdx,%r10), %ymm14
        vfmadd231pd     %ymm8, %ymm13, %ymm14   # += a[2][3] * b[3]
        vfmadd231pd     %ymm7, %ymm12, %ymm14   # += a[2][2] * b[2]
        vfmadd231pd     %ymm6, %ymm11, %ymm14   # += a[2][1] * b[1]
        vfmadd231pd     %ymm5, %ymm10, %ymm14   # += a[2][0] * b[0]
        vmovapd %ymm14, (%rdx,%r10)

        # c row 3: splat each lane of ymm9 into ymm15 just before use
        vmovapd (%rdx,%r11), %ymm14
        vpermpd $0xff, %ymm9, %ymm15            # a[3][3] in every lane
        vfmadd231pd     %ymm15, %ymm13, %ymm14
        vpermpd $0xaa, %ymm9, %ymm15            # a[3][2]
        vfmadd231pd     %ymm15, %ymm12, %ymm14
        vpermpd $0x55, %ymm9, %ymm15            # a[3][1]
        vfmadd231pd     %ymm15, %ymm11, %ymm14
        vpermpd $0x00, %ymm9, %ymm15            # a[3][0]
        vfmadd231pd     %ymm15, %ymm10, %ymm14
        vmovapd %ymm14, (%rdx,%r11)

        addq    $32, %rdx                       # next slice of c
        decq    %r8
        jne     .L3

        vzeroupper                              # clear upper ymm halves so legacy-SSE
                                                # code in the caller pays no
                                                # AVX/SSE transition penalty
.L1:
        ret
        .cfi_endproc
.LFE2464:
        .size   matmulinner, .-matmulinner
        .ident  "GCC: (Debian 4.9.2-10) 4.9.2"
        .section        .note.GNU-stack,"",@progbits