! -----------------------------------------------------------
! twiddle_4bit_md_asm function for GenesisPlusDC emulator by Donald Haase (Quzar)
!
!	Hand-tuned starting from the best gas output the author could get.
!	Instruction order is deliberately interleaved for pipelining, and three
!	instructions sit in branch delay slots -- do not reorder casually.
!
! Purpose:
!	Reads 32 source bytes from 'pixels' and writes 16 halfwords to 'vtex',
!	packing four 4-bit values into each halfword and storing the halfwords
!	in bit-interleaved ("twiddled") index order.
!	NOTE(review): presumably one 8x8, 4-bits-per-pixel tile with 4 bytes
!	per source row -- confirm against the caller.
!
! Reference formula, for y = 0..3 and x = 0..3:
!	a = pixels[8*y     + (x ^ 1)]
!	b = pixels[8*y + 4 + (x ^ 1)]
!	vtex[(tw(x) << 1) | tw(y)] =
!		((a & 15) << 8) | (a >> 4) | ((b & 15) << 12) | (b & 240)
!	where tw(n) = (n & 1) | ((n & 2) << 1) | ((n & 4) << 2)
!
! Parameters:
! u8 *pixels, u16 *vtex
! r4 = *pixels	(never modified)
! r5 = *vtex	(copied to r8 in the prologue; r5 is then reused as y)
!
! Register usage:
! r0  = scratch; also the index register for @(r0,Rn) addressing
! r1  = source byte currently being unpacked
! r2  = tw(y), recomputed once per outer iteration
! r3  = r7 + 4 (byte offset of the second source row of the pair)
! r5  = y (outer counter, 0..3)
! r6  = 3 (upper bound used by both loops)
! r7  = y * 8 (byte offset of the first source row of the pair)
! r8  = vtex
! r9  = tw(x), then the output index, then the output byte offset
! r10 = x (inner counter, 0..3)
! r11 = output halfword being assembled
! r12 = pixels + (x ^ 1), later reused as the shift count 12
! r13 = 1 !always set to 1
! r14 = 4 !always set to 4
!
! Clobbers: r0-r3, r5-r7 and the T bit.
! r8-r14 are pushed in the prologue and popped in the epilogue.
! -----------------------------------------------------------
	.text
	.balign 32
	.global	_twiddle_4bit_md_asm
	.type	_twiddle_4bit_md_asm, @function
_twiddle_4bit_md_asm:
	! Prologue: push r8-r14, interleaved with constant/counter setup.
	mov.l	r8,@-r15
	mov	#3,r6			!r6 = 3, loop bound for both x and y
	mov.l	r9,@-r15
	mov	r5,r8			!r8 is now vtex, freeing r5 for y
	mov.l	r10,@-r15
	mov	#0,r5			!y = 0
	mov.l	r13,@-r15	!r13 will hold the constant 1: author's rationale is that reg->reg moves are faster than imm->reg
	mov	#0,r7			!r7 = y*8 = 0
	mov.l	r14,@-r15
	mov #1, r13
	mov.l	r11,@-r15
	mov #4, r14			!same idea as r13, but holding the constant 4
	mov.l	r12,@-r15
	! (disabled) a 4-byte stack scratch slot used to be reserved here: add #-4,r15

.L9:			!outer loop (y): r2 = tw(y), r3 = r7 + 4, x = 0
	mov	r5,r0	
	and	#2,r0
	mov	r13,r2	!r2 = 1; placed here (instead of "mov #1,r2" further down) to improve pipelining
	add	r0,r0	!r0 = (y & 2) << 1
	and	r5,r2	!r2 = y & 1
	mov	r7,r3
	or	r0,r2	!r2 = (y & 1) | ((y & 2) << 1)
	mov	r5,r0	
	and	r14,r0	!r0 = y & 4 (always 0 here: the loop exits once y > 3)
	mov	#0,r10		!x = 0
	shll2	r0	!r0 = (y & 4) << 2
	add	r14,r3	!r3 = r7 + 4 -> offset of the second source row
	or	r0,r2	!r2 = tw(y)
.L8:			!inner loop (x): build one halfword and store it
	mov	r10,r0
	and	#2,r0
	add	r0,r0	!r0 = (x & 2) << 1
	mov	r13,r9	!r9 = 1
	and	r10,r9	!r9 = x & 1
	mov r13,r12	!r12 = 1
	or	r0,r9	!r9 = (x & 1) | ((x & 2) << 1)
	mov	r10,r0
	xor	r10,r12	!r12 = x ^ 1: source bytes are fetched pair-swapped (NOTE(review): presumably byte-swapped 16-bit source words -- confirm)
	and	r14,r0	!r0 = x & 4 (always 0 here: the loop exits once x > 3)
	add	r4,r12	!r12 = pixels + (x ^ 1)
	shll2	r0	!r0 = (x & 4) << 2
	or	r0,r9	!r9 = tw(x)
	mov	r12,r0
	mov.b	@(r0,r7),r11	!a = pixels[(x^1) + y*8] (mov.b sign-extends)
	add	r9,r9	!r9 = tw(x) << 1
	or	r2,r9	!r9 = (tw(x) << 1) | tw(y) = output halfword index
	extu.b	r11,r1	!r1 = a, zero-extended
	! (disabled) spill of r1 to the stack scratch slot: mov.l r1,@r15
	mov	r1,r0
	shlr2	r1
	and	#15,r0
	shll8	r0	!r0 = (a & 15) << 8
	shlr2	r1	!r1 = a >> 4 (two shifts by 2)
	mov	r0,r11
	mov	r12,r0
	or	r1,r11	!r11 = ((a & 15) << 8) | (a >> 4)
	mov.b	@(r0,r3),r1	!b = pixels[(x^1) + y*8 + 4]; sign bits are masked off by the ANDs below
	mov	#12,r12	!r12 is reused as the shift count for shld
	add	r13,r10	!x++
	mov	r1,r0
	and	#15,r0
	shld	r12,r0	!r0 = (b & 15) << 12
	cmp/gt	r6,r10	!T = (x > 3); nothing below changes T before bf.s
	or	r0,r11
	mov	r1,r0
	and	#240,r0	!r0 = b & 0xF0
	or	r0,r11	!r11 is now the complete output halfword
	add	r9,r9	!index -> byte offset (halfword elements)
	mov	r9,r0
	bf.s	.L8	!loop while x <= 3
	mov.w	r11,@(r0,r8)	!delay slot: store the halfword (executes whether or not the branch is taken)
	add	r13,r5	!y++
	cmp/gt	r6,r5	!T = (y > 3)
	bf.s	.L9	!loop while y <= 3
	add	#8,r7	!delay slot: advance source offset by 8 bytes (two 4-byte rows)
	! (disabled) release of the stack scratch slot: add #4,r15
	! Epilogue: pop in exact reverse order of the prologue pushes.
	mov.l	@r15+,r12
	mov.l	@r15+,r11
	mov.l	@r15+,r14
	mov.l	@r15+,r13
	mov.l	@r15+,r10
	mov.l	@r15+,r9
	rts	
	mov.l	@r15+,r8	!delay slot of rts: final pop happens before control returns
	.size	_twiddle_4bit_md_asm, .-_twiddle_4bit_md_asm
	.ident	"GCC: (GNU) 3.4.2"
