# asblas.s:
#
# Vector Operations in VAX Assembler
# Single Precision Version
# Loops unrolled 4 times
# Compile on UNIX or VMS using the UNIX assembler.
# cc -c asblas.s
#
# For documentation, see file toblas.c
#
# Oliver McBryan
# New York University
#
#
#
# zero_vector(n,v)
# v[i] = 0.
#   4(ap)  n   longword element count
#   8(ap)  v   address of the first F_floating element
# Four elements are cleared per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _szv
_szv:
	.word	0xc00			# entry mask: r10 and r11 are saved
	movl	8(ap),r10		# r10 walks through v
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	szv_test		# check the count before the first pass
szv_body:
	clrf	(r10)+			# element 0 of this group
	clrf	(r10)+			# element 1
	clrf	(r10)+			# element 2
	clrf	(r10)+			# element 3
szv_test:
	sobgeq	r11,szv_body		# count down; loop while result >= 0
	ret
# vector_equals_scalar(n,v,a)
# v[i] = a
#   4(ap)  n   longword element count
#   8(ap)  v   address of the first F_floating element
#  12(ap)  a   scalar, read from the argument list as D_floating
# Four elements are stored per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _sves
_sves:
	.word	0xc00			# entry mask: r10 and r11 are saved
	cvtdf	12(ap),r0		# r0 = a narrowed to F_floating
	movl	8(ap),r10		# r10 walks through v
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	sves_test		# check the count before the first pass
sves_body:
	movf	r0,(r10)+		# element 0 of this group
	movf	r0,(r10)+		# element 1
	movf	r0,(r10)+		# element 2
	movf	r0,(r10)+		# element 3
sves_test:
	sobgeq	r11,sves_body		# count down; loop while result >= 0
	ret
# float sum_elements_of_vector(n,v)
# Returns v[0] + v[1] + ... + v[n-1].
#   4(ap)  n   longword element count
#   8(ap)  v   address of the first F_floating element
# Four elements are consumed per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
# The sum is accumulated in F_floating and converted to D_floating
# in r0/r1 just before returning.
.text
.align 1
.globl _svsum
_svsum:
	.word	0xc00			# entry mask: r10 and r11 are saved
	clrf	r0			# r0 = running sum, starts at 0
	movl	8(ap),r10		# r10 walks through v
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svsum_test		# check the count before the first pass
svsum_body:
	addf3	(r10)+,(r10)+,r1	# r1 = first pair of this group
	addf2	r1,r0			# fold it into the sum
	addf3	(r10)+,(r10)+,r1	# r1 = second pair of this group
	addf2	r1,r0			# fold it into the sum
svsum_test:
	sobgeq	r11,svsum_body		# count down; loop while result >= 0
	cvtfd	r0,r0			# widen the F_floating sum to D_floating
	ret
# copy_vector_to_vector(n,v1,v2)
# v2[i] = v1[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the source vector
#  12(ap)  v2   address of the destination vector
# The copy is one MOVC3 of n*4 bytes.  MOVC3 takes a word-sized length,
# so only the low 16 bits of n*4 are used, and it leaves r0-r5 modified.
# NOTE(review): the entry mask saves nothing; confirm that callers do
# not expect r2-r5 to survive this call.
.text
.align 1
.globl _svev
_svev:
	.word	0x0			# entry mask: no registers saved
	ashl	$2,4(ap),r0		# r0 = n * 4 = byte count
	movc3	r0,*8(ap),*12(ap)	# copy r0 bytes from v1 to v2
	ret
# add_scalar_to_vector(n,a,v)
# v[i] = v[i] + a
#   4(ap)  n   longword element count
#   8(ap)  a   scalar, read from the argument list as D_floating
#  16(ap)  v   address of the first F_floating element
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svas
_svas:
	.word	0xc00			# entry mask: r10 and r11 are saved
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	cvtdf	8(ap),r0		# r0 = a narrowed to F_floating
	movl	16(ap),r10		# r10 walks through v
	jbr	d2			# check the count before the first pass
d1:					# loop body (label was missing)
	addf3	r0,(r10),(r10)+		# read v[i], store v[i]+a, step to next
	addf3	r0,(r10),(r10)+
	addf3	r0,(r10),(r10)+
	addf3	r0,(r10),(r10)+
d2:					# loop test (label was missing)
	sobgeq	r11,d1			# count down; loop while result >= 0
	ret
# multiply_vector_by_scalar(n,v,a)
# v[i] = v[i] * a
#   4(ap)  n   longword element count
#   8(ap)  v   address of the first F_floating element
#  12(ap)  a   scalar, read from the argument list as D_floating
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svms
_svms:
	.word	0xc00			# entry mask: r10 and r11 are saved
	cvtdf	12(ap),r0		# r0 = a narrowed to F_floating
	movl	8(ap),r10		# r10 walks through v
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svms_test		# check the count before the first pass
svms_body:
	mulf3	r0,(r10),(r10)+		# read v[i], store a*v[i], step to next
	mulf3	r0,(r10),(r10)+
	mulf3	r0,(r10),(r10)+
	mulf3	r0,(r10),(r10)+
svms_test:
	sobgeq	r11,svms_body		# count down; loop while result >= 0
	ret
# vector_equals_scalar_plus_vector(n,v1,a,v2)
# v1[i] = v2[i] + a
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the destination vector
#  12(ap)  a    scalar, read from the argument list as D_floating
#  20(ap)  v2   address of the source vector
# Four elements are produced per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svespv
_svespv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	cvtdf	12(ap),r0		# r0 = a narrowed to F_floating
	movl	20(ap),r9		# r9 walks through v2 (source)
	movl	8(ap),r10		# r10 walks through v1 (destination)
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svespv_test		# check the count before the first pass
svespv_body:
	addf3	r0,(r9)+,(r10)+		# v1[i] = a + v2[i]
	addf3	r0,(r9)+,(r10)+
	addf3	r0,(r9)+,(r10)+
	addf3	r0,(r9)+,(r10)+
svespv_test:
	sobgeq	r11,svespv_body		# count down; loop while result >= 0
	ret
# vector_equals_scalar_times_vector(n,v1,a,v2)
# v1[i] = v2[i] * a
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the destination vector
#  12(ap)  a    scalar, read from the argument list as D_floating
#  20(ap)  v2   address of the source vector
# Four elements are produced per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svesmv
_svesmv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	movl	8(ap),r10		# r10 walks through v1 (destination)
	cvtdf	12(ap),r0		# r0 = a narrowed to F_floating
	movl	20(ap),r9		# r9 walks through v2 (source)
	jbr	g2			# check the count before the first pass
g1:					# loop body (label was missing)
	mulf3	r0,(r9)+,(r10)+		# v1[i] = a * v2[i]
	mulf3	r0,(r9)+,(r10)+
	mulf3	r0,(r9)+,(r10)+
	mulf3	r0,(r9)+,(r10)+
g2:					# loop test (label was missing)
	sobgeq	r11,g1			# count down; loop while result >= 0
	ret
# float inner_product(n,v1,v2)
# Returns v1[0]*v2[0] + ... + v1[n-1]*v2[n-1].
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the first vector
#  12(ap)  v2   address of the second vector
# Four products are accumulated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
# The sum is accumulated in F_floating and converted to D_floating
# in r0/r1 just before returning.
.text
.align 1
.globl _svdotv
_svdotv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	movl	8(ap),r10		# r10 walks through v1
	movl	12(ap),r9		# r9 walks through v2
	clrf	r0			# r0 = running sum, starts at 0
	jbr	h2			# check the count before the first pass
h1:					# loop body (label was missing)
	mulf3	(r9)+,(r10)+,r1		# r1 = v1[i] * v2[i]
	addf2	r1,r0			# fold it into the sum
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
	mulf3	(r9)+,(r10)+,r1
	addf2	r1,r0
h2:					# loop test (label was missing)
	sobgeq	r11,h1			# count down; loop while result >= 0
	cvtfd	r0,r0			# widen the F_floating sum to D_floating
	ret
# multiply_vector_by_vector(n,v1,v2)
# v1[i] = v1[i]*v2[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the vector updated in place
#  12(ap)  v2   address of the multiplier vector
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svmv
_svmv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	movl	8(ap),r10		# r10 walks through v1
	movl	12(ap),r9		# r9 walks through v2
	jbr	i2			# check the count before the first pass
i1:					# loop body (was "1:", never referenced)
	mulf3	(r9)+,(r10),(r10)+	# read v1[i], store v1[i]*v2[i], step on
	mulf3	(r9)+,(r10),(r10)+
	mulf3	(r9)+,(r10),(r10)+
	mulf3	(r9)+,(r10),(r10)+
i2:					# loop test (was "2:", never referenced)
	sobgeq	r11,i1			# count down; loop while result >= 0
	ret
# divide_vector_by_vector(n,v1,v2)
# v1[i] = v1[i]/v2[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the vector updated in place (dividends)
#  12(ap)  v2   address of the divisor vector
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svdv
_svdv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	movl	8(ap),r10		# r10 walks through v1
	movl	12(ap),r9		# r9 walks through v2
	jbr	j2			# check the count before the first pass
j1:					# loop body (label was missing)
	divf3	(r9)+,(r10),(r10)+	# read v1[i], store v1[i]/v2[i], step on
	divf3	(r9)+,(r10),(r10)+
	divf3	(r9)+,(r10),(r10)+
	divf3	(r9)+,(r10),(r10)+
j2:					# loop test (label was missing)
	sobgeq	r11,j1			# count down; loop while result >= 0
	ret
# subtract_vector_from_vector(n,v1,v2)
# v2[i] = v2[i] - v1[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the vector that is subtracted
#  12(ap)  v2   address of the vector updated in place
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.globl _svlv
_svlv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	movl	12(ap),r9		# r9 walks through v2
	movl	8(ap),r10		# r10 walks through v1
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svlv_test		# check the count before the first pass
svlv_body:
	subf3	(r10)+,(r9),(r9)+	# read v2[i], store v2[i]-v1[i], step on
	subf3	(r10)+,(r9),(r9)+
	subf3	(r10)+,(r9),(r9)+
	subf3	(r10)+,(r9),(r9)+
svlv_test:
	sobgeq	r11,svlv_body		# count down; loop while result >= 0
	ret
# add_scalar_times_vector_to_vector(n,a,v1,v2)
# v2[i] = v2[i] + a*v1[i]
#   4(ap)  n    longword element count
#   8(ap)  a    scalar, read from the argument list as D_floating
#  16(ap)  v1   address of the vector that is scaled
#  20(ap)  v2   address of the vector updated in place
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svpsv
_svpsv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	cvtdf	8(ap),r0		# r0 = a narrowed to F_floating
	movl	20(ap),r9		# r9 walks through v2
	movl	16(ap),r10		# r10 walks through v1
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svpsv_test		# check the count before the first pass
svpsv_body:
	mulf3	r0,(r10)+,r1		# r1 = a * v1[i]
	addf3	r1,(r9),(r9)+		# v2[i] = v2[i] + r1
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
	mulf3	r0,(r10)+,r1
	addf3	r1,(r9),(r9)+
svpsv_test:
	sobgeq	r11,svpsv_body		# count down; loop while result >= 0
	ret
# vector_equals_vector_minus_vector(n,v1,v2,v3)
# v1[i] = v2[i] - v3[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the destination vector
#  12(ap)  v2   address of the minuend vector
#  16(ap)  v3   address of the subtrahend vector
# Four elements are produced per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svevlv
_svevlv:
	.word	0xe00			# entry mask: r9, r10 and r11 are saved
	movl	16(ap),r9		# r9 walks through v3
	movl	12(ap),r10		# r10 walks through v2
	movl	8(ap),r0		# r0 walks through v1 (destination)
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svevlv_test		# check the count before the first pass
svevlv_body:
	subf3	(r9)+,(r10)+,(r0)+	# v1[i] = v2[i] - v3[i]
	subf3	(r9)+,(r10)+,(r0)+
	subf3	(r9)+,(r10)+,(r0)+
	subf3	(r9)+,(r10)+,(r0)+
svevlv_test:
	sobgeq	r11,svevlv_body		# count down; loop while result >= 0
	ret
# add_vector_times_vector_to_vector(n,v1,v2,v3)
# v3[i] = v3[i] + v1[i]*v2[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the first factor vector
#  12(ap)  v2   address of the second factor vector
#  16(ap)  v3   address of the vector updated in place
# Four elements are updated per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.text
.align 1
.globl _svpvv
_svpvv:
	.word	0xf00			# entry mask: r8, r9, r10 and r11 are saved
	movl	16(ap),r8		# r8 walks through v3
	movl	12(ap),r9		# r9 walks through v2
	movl	8(ap),r10		# r10 walks through v1
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svpvv_test		# check the count before the first pass
svpvv_body:
	mulf3	(r10)+,(r9)+,r0		# r0 = v1[i] * v2[i]
	addf3	r0,(r8),(r8)+		# v3[i] = v3[i] + r0
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
	mulf3	(r10)+,(r9)+,r0
	addf3	r0,(r8),(r8)+
svpvv_test:
	sobgeq	r11,svpvv_body		# count down; loop while result >= 0
	ret
# vector_equals_vector_plus_scalar_times_vector(n,v1,v2,a,v3)
# v1[i] = v2[i] + a*v3[i]
#   4(ap)  n    longword element count
#   8(ap)  v1   address of the destination vector
#  12(ap)  v2   address of the vector that is added
#  16(ap)  a    scalar, read from the argument list as D_floating
#  24(ap)  v3   address of the vector that is scaled
# Four elements are produced per pass and only n/4 passes are made,
# so n is assumed to be a multiple of 4.
.globl _svevpsv
_svevpsv:
	.word	0xf00			# entry mask: r8, r9, r10 and r11 are saved
	cvtdf	16(ap),r0		# r0 = a narrowed to F_floating
	movl	24(ap),r8		# r8 walks through v3
	movl	12(ap),r9		# r9 walks through v2
	movl	8(ap),r10		# r10 walks through v1 (destination)
	ashl	$-2,4(ap),r11		# r11 = n / 4 = passes still to do
	jbr	svevpsv_test		# check the count before the first pass
svevpsv_body:
	mulf3	(r8)+,r0,r1		# r1 = a * v3[i]
	addf3	r1,(r9)+,(r10)+		# v1[i] = v2[i] + r1
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
	mulf3	(r8)+,r0,r1
	addf3	r1,(r9)+,(r10)+
svevpsv_test:
	sobgeq	r11,svevpsv_body	# count down; loop while result >= 0
	ret
# end of asblas.s