I thought it may interest people to see how quite small pieces of C can generate some rather unnerving workings when converting into the Thumb instruction set. As always, i still desperately in need of a faster method to find the top 32 bits of a 32-bit x32-bit --->64 bit multiply. Bastian Schick currently holds the record having a piece of in-line code that takes just 17 cycles. The keen watcher will know that Bastian optimised the D32PF by unrolling and applying some great code. So here is the C;
void FDCT32(int *buf, int *dest, int offset, int oddBlock, int gb){ int i, s, tmp, es; const int *cptr = dcttab; int a0, a1, a2, a3, a4, a5, a6, a7; int b0, b1, b2, b3, b4, b5, b6, b7; int *d;
/* first pass */ D32FP(0, 1, 5, 1); D32FP(1, 1, 3, 1); D32FP(2, 1, 3, 1); D32FP(3, 1, 2, 1); D32FP(4, 1, 2, 1); D32FP(5, 1, 1, 2); D32FP(6, 1, 1, 2); D32FP(7, 1, 1, 4);
/* second pass */ for (i = 4; i > 0; i--) { a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4]; b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) << 1; a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) << 1;
a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5]; b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) << 2; a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) << 2;
b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1; b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1; buf[0] = b0; buf[1] = b1; buf[2] = b2 + b3; buf[3] = b3;
b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1; b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1; b6 += b7; buf[4] = b4 + b6; buf[5] = b5 + b7; buf[6] = b5 + b6; buf[7] = b7;
buf += 8; } buf -= 32; /* reset */And for comparison, my α version of the same logical steps. You will note that the MULSHIFT32 macro is frequent and takes 17 cycles.
;Second pass of DCT
ldr r0,[r7,#0] ldr r1,[r7,#7] add r5,r0,r1 ;b0 sub r0,r1
pop r1
MULSHIFT32
lsls r0,r0,#1 mov r8,r0 ;b7
;store b0=r5 & b7=r8
ldr r0,[r7,#3] ldr r1,[r7,#4] add r6,r0,r1 ;b3 sub r0,r1 pop r1
lsls r0,r0,#3 mov r9,r0 ;b4
;store b0=r5,b3=r6,b4=r9,b7=r8
sub r0,r5,r6 add r5,r5,r6 ;a0 = r5
ldr r1,[sp,#0]
lsls r0,r0,#1 mov r10,r0 ;a3
;store a0=r5,a3=r10,b4=r9,b7=r8
mov r0,r8 mov r1,r9 add r6,r0,r1 ;a4
sub r0,r1 pop r1
lsls r0,r0,#1 mov r8,r0 ;a7
;store a0=r5,a3=r10,a4=r6,a7=r8
ldr r0,[r7,#1] ldr r1,[r7,#6] add r2,r0,r1 mov r9,r2 ;b1
lsls r0,r0,#1 mov r10,r0 ;b6
;store a0=r5,a3=r10,a4=r6,a7=r8,b1=r9,b6=r11
ldr r0,[r7,#2] ldr r1,[r7,#5] add r2,r0,r1 mov r14,r2 ;b2
lsls r0,r0,#1 mov r12,r0 ;b5
;store a0=r5,a3=r10,a4=r6,a7=r8,b1=r9,b2=r11,b5=r12,b6=r14
mov r2,r9 mov r3,r11 add r9,r2 ;a1
sub r0,r2,r3
lsls r0,r0,#2
mov r11,r0 ;a2 ;store a0=r5,a1=r9,a2=r11,a3=r10,a4=r6,a7=r8,b5=r12,b6=r14
mov r2,r12 add r2,r14
add r5,r2,r6 ;a1
sub r0,r2,r6
lsls r0,r0,#2 mov r6,r0 ;a2
;store a0=r5,a1=r9,a2=r6,a3=r10,a4=r6,a7=r8,b5=r12,b6=r14
mov r1,r9 ;get a1 add r0,r5,r1 ;a0 + a1 str [buf,#0],r0
sub r0,r5,r1 ;a0 - a1
ldr r1,#COS4_0 ;copy for later use, mov r14,1 ;
lsls r0,r0,#1 str [buf,#1],r0
;a2=r5,a1=r9,a4=r6,a5=r5,a6=r4,a7=r8,b0=b0,b1
mov r0,r10 ;get a3 add r0,r6 ;a3 + a2 mov r5,r0 ;store b0 str [buf,#2],r0
mov r0,r10 ;get a3 sub r0,r0,r5 ;a3 - a2
ldr r1,#COS4_0
lsls r0,r0,#1 str [buf,#2],r0 add r0,r6 str [buf,#3],r0
;a4=r6,a5=r5,a6=r7,a7=r8 add r0,r5,r6 mov r9,r0 ;b4
sub r0,r6,r7 ;a4 - a5 mov r1,r14
lsls r5,r0,#1 ;b5
mov r0,r7 add r0,r8 ;b6 mov r8,r0
sub r0,r7
mov r1,r14
lsls r6,r0,#1 ;b7
;b4=r9,b5=r5,b6=r8,b7=r6
str [buf,#7],r6 ;store b7
mov r0,r9 add r0,r8
str [buf,#4],r0 ;store b4
add r0,r5,r6 ;b5 + b7
str [buf,#5],r0 ;store b5
add r5,r8
str [buf,#6],r5 ;store b6To be clear, this isn't perfect working code but rather a 'sketch' to roughly work out how many cycles the DCT takes in an MP3 decoder. I haven't calculated the exact number of cycles but it IS impossible to use a conditional branch back to the top of the loop. The code branches around an unconditional loop. Now I haven't dealt with the loops but with good reason, since I use the stack-pointer as a general purpose register, the plan is to find the base-address of RAM (0x20000000 on the board I have) so that the address can be generated in 3 instructions.The code is mind-bending. I did alter the order of the C slightly so that no more than 8 variables have to be stored in registers at any given time. The reason for that is simple:r0-r4 - Used by MULSHIFT32 but can be used between usages of the macro.r5-r6 - lo-register accumulators/storage of variablesr7 - pointer to the source address of the datar8 - r13 storage for variablesr14 - pointer to DCT workspaceAs a good rule of thumb (ha ha), MANY pieces of code that are developed by academics presume a RISC processor with 16 x 32-bit registers, one of which is the stack-pointer. Well, ARM/Thumb has SP,LR & IP making the person converting to a generic RISC processor needs to use SP & LR. There IS a trick involved so that even if an exception/interrupt takes place, nothing is corrupted.But if anyone out there has a short-cut to finding the top 32-bits of a 64-bit multiply result, please let me know...
Many thanks Bastian. It doesn't seem to have a DAC but it DOES have DMA. It looks like I'm going to have to build a system to test the code but I am truly terrible at hardware so if anyone can or knows someone who can, I am prepared to buy a couple of systems. I think an Arduino Feather m0+ with an amplifier would work. The fact it uses MicroSD is also useful because I am presuming that DMAing an entire page (0.5 - 8Kb depending on card).I've gone through my sketch and found a few bugs but the important thing is the code excepting MULSHIFT32 macro is 105 cycles. OK, maybe alteration might get that down to 100, not less, I don't think. The KEY thing is that WITH the MULSHIFT32 it takes 309 cycles i.e. 204 cycles are the long multiplication.I'm still going to go forward with sketching the other pieces of code that take up most of the CPU time. I note that some companies state that they offer an MP3 decoder for the M0+ but when you ask, they all admit that they actually don't. I think they assume that simply compiling their C code will work. It will NOT. A 100% assembly language solution is, happily, my métier.I am now looking at the polyphase filter. My very FIRST decision is to deal with stereo channels in 2 passes. It's just possible to do the mono version in registers, but certainly not the stereo. While this code doesn't use MULSHIFT32, it does something worse i.e. MADD64. That means that a minimum of 6 of the lo registers will be needed and since r7 is likely needed as a pointer, that leaves 1 register to act as the accumulator. This is old-skool coding.I sent my oldest friend Joseph Yiu's book on the M0 & M0+ (everyone should have these), an Arduino Zero and a USB<--->MicroUSB cable. If SHE says it's possible, it's possible.But I am still trying to work out what is the best system to develop 100% assembly language. It's evidently so unusual that nobody seems to know!