########################################################################
# ISPACK FORTRAN SUBROUTINE LIBRARY FOR SCIENTIFIC COMPUTING
# Copyright (C) 1998--2017 Keiichi Ishioka <ishioka@gfd-dennou.org>
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 USA.
########################################################################
.text
#-----------------------------------------------------------------------
# lvoswg_(JB, AC, SD, Q)        (also exported as _lvoswg_)
#
# Syntax: GNU as, AT&T operand order.  Requires AVX-512F.
# ABI:    System V AMD64 register assignment; every argument is an
#         address (JB itself is read through its pointer).
#
# In:     rdi -> JB  32-bit integer: number of 448-byte blocks in Q
#         rsi -> AC  4 doubles:  AC1, AC2, AC3, AC4
#         rdx -> SD  8 doubles:  SD1R, SD1I, SD2R, SD2I,
#                                SD3R, SD3I, SD4R, SD4I
#         rcx -> Q   JB blocks; each block is 7 vectors of 8 doubles in
#                    the order X2, Q1, Q2, G1R, G2R, G1I, G2I.
#                    Q is accessed with vmovapd and therefore must be
#                    64-byte aligned.
#
# Per block, element-wise over the 8 lanes (Q1/Q2 on the right-hand side
# of the G updates are the values loaded at the top of the iteration,
# i.e. before Q1'/Q2' are stored):
#         Q1' = Q2  * (AC1*X2 + AC2) + Q1
#         Q2' = Q1' * (AC3*X2 + AC4) + Q2
#         G1R += SD2R*Q1 + SD4R*Q2
#         G2R += SD1R*Q1 + SD3R*Q2
#         G1I += SD2I*Q1 + SD4I*Q2
#         G2I += SD1I*Q1 + SD3I*Q2
#
# JB <= 0 returns immediately without touching Q.
#
# Clobb:  rax, rdx, rdi, r8, r9, zmm0-zmm15, flags
# Stack:  256 bytes of 64-byte-aligned scratch reserved below rsp for
#         the duration of the call; rsp is restored before returning.
# NOTE(review): no vzeroupper is issued before ret; confirm that callers
#         are built with VEX/EVEX encodings if SSE transition cost matters.
#-----------------------------------------------------------------------
.globl lvoswg_
.globl _lvoswg_
lvoswg_:
_lvoswg_:
	movl   (%rdi), %edi        # edi = JB (32-bit load zeroes the upper half of rdi)
	testl  %edi, %edi
	jle    2f                  # JB <= 0: the bottom-tested loop would never hit its end pointer

	shlq   $6, %rdi            # rdi = JB*64
	leaq   (,%rdi,8), %r8      # r8  = JB*64*8
	subq   %rdi, %r8           # r8  = JB*64*7 = JB*448 (bytes covered by JB blocks)
	addq   %rcx, %r8           # r8  = address one past the last block of Q

	# Real parts of SD, each broadcast to all 8 lanes.
	vbroadcastsd   (%rdx), %zmm0    # zmm0 = SD1R
	vbroadcastsd 16(%rdx), %zmm1    # zmm1 = SD2R
	vbroadcastsd 32(%rdx), %zmm2    # zmm2 = SD3R
	vbroadcastsd 48(%rdx), %zmm3    # zmm3 = SD4R

	# Imaginary parts of SD, each broadcast to all 8 lanes.
	vbroadcastsd  8(%rdx), %zmm4    # zmm4 = SD1I
	vbroadcastsd 24(%rdx), %zmm5    # zmm5 = SD2I
	vbroadcastsd 40(%rdx), %zmm6    # zmm6 = SD3I
	vbroadcastsd 56(%rdx), %zmm7    # zmm7 = SD4I

	# AC coefficients, each broadcast to all 8 lanes.
	vbroadcastsd   (%rsi), %zmm8    # zmm8  = AC1
	vbroadcastsd  8(%rsi), %zmm9    # zmm9  = AC2
	vbroadcastsd 16(%rsi), %zmm10   # zmm10 = AC3
	vbroadcastsd 24(%rsi), %zmm11   # zmm11 = AC4

	movq   %rcx, %rax          # rax = address of the current block

	# Reserve 64-byte-aligned scratch for the four coefficients that the
	# loop reads as memory operands.  rsp is moved below the scratch area
	# so that it is owned by this frame: 256 bytes below the aligned
	# address is more than the 128-byte red zone covers.
	movq   %rsp, %r9           # r9  = caller's rsp, restored before ret
	movq   %rsp, %rdx          # SD pointer is no longer needed; reuse rdx
	andq   $-64, %rdx          # rdx = rsp rounded down to a multiple of 64
	leaq   -256(%rdx), %rsp    # scratch = [rdx-256, rdx)

	vmovapd %zmm0,  -64(%rdx)  # SD1R
	vmovapd %zmm1, -128(%rdx)  # SD2R
	vmovapd %zmm4, -192(%rdx)  # SD1I
	vmovapd %zmm5, -256(%rdx)  # SD2I

1:
	vmovapd    (%rax), %zmm14  # zmm14 = X2
	vmovapd    %zmm14, %zmm15  # zmm15 = X2
	vmovapd  64(%rax), %zmm12  # zmm12 = Q1 (value before update)
	vmovapd 128(%rax), %zmm13  # zmm13 = Q2 (value before update)

	vfmadd213pd %zmm9,  %zmm8,  %zmm15  # zmm15 = AC1*X2 + AC2
	vfmadd213pd %zmm12, %zmm13, %zmm15  # zmm15 = Q2*zmm15 + Q1  = Q1'
	vmovapd %zmm15, 64(%rax)            # store Q1'

	vfmadd213pd %zmm11, %zmm10, %zmm14  # zmm14 = AC3*X2 + AC4
	vfmadd213pd %zmm13, %zmm15, %zmm14  # zmm14 = Q1'*zmm14 + Q2 = Q2'
	vmovapd %zmm14, 128(%rax)           # store Q2'

	# The first FMA of each G update takes its coefficient from the
	# scratch area; the register holding the same value is named in the
	# comment.
	vmovapd 192(%rax), %zmm14              # G1R
	vfmadd231pd -128(%rdx), %zmm12, %zmm14 # += SD2R (zmm1) * Q1
	vfmadd231pd %zmm3, %zmm13, %zmm14      # += SD4R * Q2
	vmovapd %zmm14, 192(%rax)

	vmovapd 256(%rax), %zmm14              # G2R
	vfmadd231pd -64(%rdx), %zmm12, %zmm14  # += SD1R (zmm0) * Q1
	vfmadd231pd %zmm2, %zmm13, %zmm14      # += SD3R * Q2
	vmovapd %zmm14, 256(%rax)

	vmovapd 320(%rax), %zmm14              # G1I
	vfmadd231pd -256(%rdx), %zmm12, %zmm14 # += SD2I (zmm5) * Q1
	vfmadd231pd %zmm7, %zmm13, %zmm14      # += SD4I * Q2
	vmovapd %zmm14, 320(%rax)

	vmovapd 384(%rax), %zmm14              # G2I
	vfmadd231pd -192(%rdx), %zmm12, %zmm14 # += SD1I (zmm4) * Q1
	vfmadd231pd %zmm6, %zmm13, %zmm14      # += SD3I * Q2
	vmovapd %zmm14, 384(%rax)

	addq   $448, %rax          # advance to the next block (7 vectors * 64 bytes)
	cmpq   %rax, %r8
	jne    1b                  # JB >= 1 is guaranteed above, so rax reaches r8 exactly

	movq   %r9, %rsp           # release the scratch area
2:
	ret
