SSE programming is very interesting in that four numbers are processed in parallel. SSE has registers 128 bits wide, so each one can hold 4 floats. In GCC C there is no default type for 128 bits, so we define our own structure for that.
typedef struct xmm
{
float a;
float b;
float c;
float d;
} xmm __attribute__ ((aligned (16)));structure is aligned for perfomance.to make 4byted value + 4byte valuewe need to load values:
movaps xmm0, [eax]
movaps xmm1, [ebx]
and then add them:
addps xmm0,xmm1
After that, store the result somewhere:
movaps [eax], xmm0
The final test program in C looks like this:
typedef struct xmm
{
float a;
float b;
float c;
float d;
} xmm __attribute__ ((aligned (16)));
extern void sse_add( xmm *, xmm * );
int main( int argc, char **argv)
{
xmm x0,x1;
x0.a = 1.0;
x0.b = 2.0;
x0.c = 3.0;
x0.d = 4.0;
x1.a = x1.b = x1.c = x1.d = 5.0;
printf("%10f %10f %10f %10f\n",x0.a,x0.b,x0.c,x0.d);
printf("%10f %10f %10f %10f\n",x1.a,x1.b,x1.c,x1.d);
sse_add( &x0 , &x1 );
printf("%10f %10f %10f %10f\n",x0.a,x0.b,x0.c,x0.d);
printf("%10f %10f %10f %10f\n",x1.a,x1.b,x1.c,x1.d);
return 0;
}gcc main.c add.o -o main And asm example
; Syntax: flat assembler (fasm), Intel operand order.  Target: 32-bit x86 ELF object.
format ELF

section '.text' executable

public sse_add

;-----------------------------------------------------------------------
; void sse_add(xmm *x0, xmm *x1)
; ABI:   32-bit cdecl (arguments on the stack, caller removes them)
; In:    [ebp+8]  = x0, pointer to 4 packed floats, 16-byte aligned
;        [ebp+12] = x1, pointer to 4 packed floats, 16-byte aligned
; Out:   *x0 = *x0 + *x1, element by element; *x1 is only read
; Clobb: eax, ecx, xmm0, xmm1
;-----------------------------------------------------------------------
align 4
sse_add:
        ; arguments that are pointers to the 2 xmm data blocks
        x0 equ [ebp+8]
        x1 equ [ebp+12]

        push    ebp
        mov     ebp, esp

        mov     eax, x0                 ; eax = first block (also the destination)
        mov     ecx, x1                 ; ecx = second block; ecx instead of ebx
                                        ; because ebx is callee-saved in cdecl

        ; load the values into xmm0 and xmm1
        ; movaps needs 16-byte aligned addresses; for unaligned data
        ; another instruction (movups) would have to be used
        movaps  xmm0, [eax]
        movaps  xmm1, [ecx]

        ; sum up, result stays in xmm0
        addps   xmm0, xmm1

        ; save the result in the first argument
        movaps  [eax], xmm0

        pop     ebp
        ret
Assemble it with:
fasm add.asm add.o