.file "mmbi.c" .text .globl matmulinner .type matmulinner, @function #rdi=a, rsi=b, rdx=c, rcx=m, r8=p #callee-saved: rbx, rsp, rbp, r12-r15 #caller-saved: rax, r9-r11 #r9 =p*1*32 #r10=p*2*32 #r11=p*3*32 matmulinner: .LFB2464: .cfi_startproc vbroadcastsd (%rdi), %ymm0 vbroadcastsd 8(%rdi), %ymm1 vbroadcastsd 16(%rdi), %ymm2 vbroadcastsd 24(%rdi), %ymm3 leaq (%rdi,%rcx,8), %rdi vmovapd (%rdi), %ymm4 leaq (%rdi,%rcx,8), %rdi vbroadcastsd (%rdi), %ymm5 vbroadcastsd 8(%rdi), %ymm6 vbroadcastsd 16(%rdi), %ymm7 vbroadcastsd 24(%rdi), %ymm8 vmovapd (%rdi,%rcx,8), %ymm9 testq %r8, %r8 je .L1 movq %r8, %r9 salq $5, %r9 leaq (%r9,%r9),%r10 leaq (%r9,%r9,2),%r11 .L3: #ymm10-13=ci0j-ci3j, ymm14=bkxj, ymm15=aixkx vmovapd (%rsi,%r11), %ymm14 vmovapd (%rdx), %ymm10 vfmadd231pd %ymm3, %ymm14, %ymm10 vpermpd $0xff, %ymm4, %ymm15 vmovapd (%rdx,%r9), %ymm11 vfmadd231pd %ymm15, %ymm14, %ymm11 vmovapd (%rdx,%r10), %ymm12 vfmadd231pd %ymm8 , %ymm14, %ymm12 vpermpd $0xff, %ymm9, %ymm15 vmovapd (%rdx,%r11), %ymm13 vfmadd231pd %ymm15, %ymm14, %ymm13 vmovapd (%rsi,%r10), %ymm14 vfmadd231pd %ymm2 , %ymm14, %ymm10 vpermpd $0xaa, %ymm4, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm11 vfmadd231pd %ymm7 , %ymm14, %ymm12 vpermpd $0xaa, %ymm9, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm13 vmovapd (%rsi,%r9), %ymm14 vfmadd231pd %ymm1, %ymm14, %ymm10 vpermpd $0x55, %ymm4, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm11 vfmadd231pd %ymm6 , %ymm14, %ymm12 vpermpd $0x55, %ymm9, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm13 vmovapd (%rsi), %ymm14 vfmadd231pd %ymm0, %ymm14, %ymm10 vmovapd %ymm10, (%rdx) vpermpd $0x00, %ymm4, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm11 vmovapd %ymm11, (%rdx,%r9) vfmadd231pd %ymm5 , %ymm14, %ymm12 vmovapd %ymm12, (%rdx,%r10) vpermpd $0x00, %ymm9, %ymm15 vfmadd231pd %ymm15, %ymm14, %ymm13 vmovapd %ymm13, (%rdx,%r11) addq $32, %rsi addq $32, %rdx dec %r8 jne .L3 .L1: ret .cfi_endproc .LFE2464: .size matmulinner, .-matmulinner .ident "GCC: (Debian 4.9.2-10) 4.9.2" .section .note.GNU-stack,"",@progbits