/*
	Hardware accelerated renderer for Genesis Plus / DC
	(c) Stuart Dalton, 1st May 2005
*/

#include "shared.h"
#include <dc/pvr.h>
#include <dc/fmath.h>

/* PowerVR direct rendering state, (re)initialised by pvr_dr_init() in
   xrender_drawframe() and consumed by pvr_dr_target() in xrender_draw_tile() */
static pvr_dr_state_t dr_state;

/* The texture containing the cached tiles: the PVR allocation currently
   selected from tile_tex_array[], and the same address pre-shifted
   ((addr & 0x00fffff8) >> 3) for OR-ing into a polygon header's mode3 word */
static pvr_ptr_t tile_tex;
static uint32 tile_tex_ptr;

/* Number of PVR-side tile textures; xrender_drawframe() steps to the next
   one (wrapping to 0) at the end of every frame */
#define TILE_BUFFERS 2
static pvr_ptr_t tile_tex_array[TILE_BUFFERS];      /* PVR allocations, 0x10000 bytes each */
static uint32 tile_tex_ptr_array[TILE_BUFFERS];     /* Pre-shifted addresses of the above */
static void *tile_tex_buf;                          /* 0x10000-byte staging buffer (memalign 32) holding twiddled tiles */
static uint32 tile_tex_cur;                         /* Index of the entry in use */

/* Base PVR palette number; advanced by 4 (wrapping at 16) each time
   xrender_deal_with_palette() uploads the colours */
int xpalette_base = 0;

/* Defined elsewhere; render_line() returns immediately unless bit 0 is set */
extern int frames_skipped;

/* Only referenced by the disabled (#if 0) sprite debugging code */
#if 0
static uint32 line_test[232];
#endif

/* Depths: z values handed to xrender_draw_tile(). Sprites use
   DEPTH_FAR_S minus their position in the link chain, and tiles with the
   priority attribute bit (0x8000) get DEPTH_PRIORITY added on top. */
#define DEPTH_FAR_B	    10	/* Base depth for far background B */
#define DEPTH_FAR_A	    11	/* Base depth for far background A */
#define DEPTH_FAR_W	   101	/* Base depth for dodgy window emulation */
#define DEPTH_FAR_S	   100	/* Base depth for far sprites */
#define DEPTH_PRIORITY 100	/* Add to base for high priority */

/* Borders: flat-shaded opaque polygon header compiled once in
   xrender_init() and submitted by draw_rect() */
pvr_poly_hdr_t hdr_border;

/* Width and height, in output pixels, of the quad drawn for one tile */
#	define DELTAXY 16

/* SWRENDER START */

#ifdef ALIGN_LONG

/* Or change the names if you depend on these from elsewhere.. */
#undef READ_LONG
#undef WRITE_LONG

static __inline__ uint32 READ_LONG(void *address)
{
#ifdef LSB_FIRST  /* little endian version */
  return ( *((uint8 *)address) +
           (*((uint8 *)address+1) << 8)  +
           (*((uint8 *)address+2) << 16) +
           (*((uint8 *)address+3) << 24) );
#else             /* big endian version */
  return ( *((uint8 *)address+3) +
           (*((uint8 *)address+2) << 8)  +
           (*((uint8 *)address+1) << 16) +
           (*((uint8 *)address)   << 24) );
#endif  /* LSB_FIRST */
}

/*
 * Store a 32-bit value one byte at a time so that `address` does not have to
 * be 4-byte aligned. The byte order is selected by LSB_FIRST.
 */
static __inline__ void WRITE_LONG(void *address, uint32 data)
{
  uint8 *out = (uint8 *)address;

#ifdef LSB_FIRST
  /* Least significant byte goes to the lowest address */
  out[0] = (uint8)(data);
  out[1] = (uint8)(data >> 8);
  out[2] = (uint8)(data >> 16);
  out[3] = (uint8)(data >> 24);
#else
  /* Most significant byte goes to the lowest address */
  out[3] = (uint8)(data);
  out[2] = (uint8)(data >> 8);
  out[1] = (uint8)(data >> 16);
  out[0] = (uint8)(data >> 24);
#endif /* LSB_FIRST */
}

#endif  /* ALIGN_LONG */

#ifdef ALIGN_LONG

/*
 * Draw the two name table entries packed in ATTR (low 16 bits first).
 *
 * For each entry, bits 13-15 select a value from atex_table and
 * (entry & 0x1FFF) << 6 | LINE indexes bg_pattern_cache; two 32-bit words
 * are copied from there to dst, each OR-ed with the atex value.
 *
 * The macro is not self-contained: the caller must have `atex`, `src`,
 * `dst` and `inc` in scope, and ATTR must be a modifiable variable (it is
 * shifted right by 16 after each entry). `dst` is left pointing past the
 * four words written.
 *
 * This variant checks src and dst for 4-byte alignment and goes through
 * READ_LONG / WRITE_LONG for whichever side is misaligned.
 */
#define DRAW_COLUMN(ATTR, LINE) \
    for(inc=0;inc<2;inc++) \
    { \
      atex = atex_table[(ATTR >> 13) & 7]; \
      src = (uint32 *)&bg_pattern_cache[(ATTR & 0x1FFF) << 6 | (LINE)]; \
      if ((uint32)src & 3) \
      { \
        if ((uint32)dst & 3) \
        { \
          WRITE_LONG(dst++, READ_LONG(src++) | atex); \
          WRITE_LONG(dst++, READ_LONG(src++) | atex); \
        } \
        else \
        { \
          *dst++ = READ_LONG(src++) | atex; \
          *dst++ = READ_LONG(src++) | atex; \
        } \
      } \
      else \
      { \
        if ((uint32)dst & 3) \
        { \
          WRITE_LONG(dst++, (*src++ | atex)); \
          WRITE_LONG(dst++, (*src++ | atex)); \
        } \
        else \
        { \
          *dst++ = (*src++ | atex); \
          *dst++ = (*src++ | atex); \
        } \
      } \
      ATTR >>= 16; \
    }
#else

/*
 * Draw the two name table entries packed in ATTR (low 16 bits first),
 * using direct 32-bit loads and stores with no alignment checks.
 *
 * Same contract as the ALIGN_LONG variant except that `inc` is not used:
 * `atex`, `src` and `dst` must be in scope, ATTR is shifted right by 16
 * between the two entries (once, not twice), and the expansion is a plain
 * statement sequence, so it must be used inside a braced block.
 */
#define DRAW_COLUMN(ATTR, LINE) \
    atex = atex_table[(ATTR >> 13) & 7]; \
    src = (uint32 *)&bg_pattern_cache[(ATTR & 0x1FFF) << 6 | (LINE)]; \
    *dst++ = (*src++ | atex); \
    *dst++ = (*src++ | atex); \
    ATTR >>= 16; \
    atex = atex_table[(ATTR >> 13) & 7]; \
    src = (uint32 *)&bg_pattern_cache[(ATTR & 0x1FFF) << 6 | (LINE)]; \
    *dst++ = (*src++ | atex); \
    *dst++ = (*src++ | atex);

#endif /* ALIGN_LONG */


#ifdef ALIGN_LONG

/*
 * Interlace-mode counterpart of DRAW_COLUMN: draws the two name table
 * entries packed in ATTR (low 16 bits first).
 *
 * The pattern cache offset is built from the 10-bit pattern number
 * (<< 7), the two flip bits 0x1800 (<< 6) and LINE; when attribute bit
 * 0x1000 is set the offset is additionally XOR-ed with 0x40, exactly as the
 * non-ALIGN_LONG variant below does with its explicit `if`.
 * (The XOR term used to be `(ATTR & 0x1000) & 0x40`, which is always zero,
 * so the two variants disagreed whenever bit 0x1000 was set.)
 *
 * The caller must have `atex`, `offs`, `src`, `dst` and `inc` in scope and
 * ATTR must be a modifiable variable. src and dst are checked for 4-byte
 * alignment and accessed through READ_LONG / WRITE_LONG when misaligned.
 */
#define DRAW_COLUMN_IM2(ATTR, LINE) \
    for(inc=0;inc<2;inc++) \
    { \
      atex = atex_table[(ATTR >> 13) & 7]; \
      offs = ((ATTR & 0x03FF) << 7 | (ATTR & 0x1800) << 6 | (LINE)) ^ ((ATTR & 0x1000) >> 6); \
      src = (uint32 *)&bg_pattern_cache[offs]; \
      if ((uint32)src & 3) \
      { \
        if ((uint32)dst & 3) \
        { \
          WRITE_LONG(dst++, READ_LONG(src++) | atex); \
          WRITE_LONG(dst++, READ_LONG(src++) | atex); \
        } \
        else \
        { \
          *dst++ = READ_LONG(src++) | atex; \
          *dst++ = READ_LONG(src++) | atex; \
        } \
      } \
      else \
      { \
        if ((uint32)dst & 3) \
        { \
          WRITE_LONG(dst++, (*src++ | atex)); \
          WRITE_LONG(dst++, (*src++ | atex)); \
        } \
        else \
        { \
          *dst++ = (*src++ | atex); \
          *dst++ = (*src++ | atex); \
        } \
      } \
      ATTR >>= 16; \
    }

#else

/*
 * Interlace-mode counterpart of DRAW_COLUMN using direct 32-bit loads and
 * stores. The caller must have `atex`, `offs`, `src` and `dst` in scope;
 * ATTR is shifted right by 16 between the two entries. The expansion is a
 * plain statement sequence, so it must be used inside a braced block.
 */
#define DRAW_COLUMN_IM2(ATTR, LINE) \
    atex = atex_table[(ATTR >> 13) & 7]; \
    offs = (ATTR & 0x03FF) << 7 | (ATTR & 0x1800) << 6 | (LINE); \
    if(ATTR & 0x1000) offs ^= 0x40; \
    src = (uint32 *)&bg_pattern_cache[offs]; \
    *dst++ = (*src++ | atex); \
    *dst++ = (*src++ | atex); \
    ATTR >>= 16; \
    atex = atex_table[(ATTR >> 13) & 7]; \
    offs = (ATTR & 0x03FF) << 7 | (ATTR & 0x1800) << 6 | (LINE); \
    if(ATTR & 0x1000) offs ^= 0x40; \
    src = (uint32 *)&bg_pattern_cache[offs]; \
    *dst++ = (*src++ | atex); \
    *dst++ = (*src++ | atex);

#endif /* ALIGN_LONG */

/*
    Blend eight sprite pixels into the line buffer.

    Each of lb[0..7] is replaced by table[(old lb value << 8) | (source
    pixel | palette)], consuming eight bytes from src. The caller must have
    `lb`, `src`, `table` and `palette` in scope; `src` is advanced by eight,
    `lb` is NOT advanced. The expansion has no trailing semicolon.

    gcc complains about this:
        *lb++ = table[(*lb << 8) |(*src++ | palette)];
    .. claiming the result on lb is undefined.
    So we manually advance lb and use constant offsets into the line buffer.
*/
#define DRAW_SPRITE_TILE \
    lb[0] = table[(lb[0] << 8) |(*src++ | palette)]; \
    lb[1] = table[(lb[1] << 8) |(*src++ | palette)]; \
    lb[2] = table[(lb[2] << 8) |(*src++ | palette)]; \
    lb[3] = table[(lb[3] << 8) |(*src++ | palette)]; \
    lb[4] = table[(lb[4] << 8) |(*src++ | palette)]; \
    lb[5] = table[(lb[5] << 8) |(*src++ | palette)]; \
    lb[6] = table[(lb[6] << 8) |(*src++ | palette)]; \
    lb[7] = table[(lb[7] << 8) |(*src++ | palette)]

/* Pixel creation macros, input is four bits each */

/* 5:6:5 RGB: r lands at bit 12, g at bit 7 and b at bit 1, i.e. each
   four-bit input fills the top four bits of its field. xrender_init() calls
   this with 3-bit values, with those values doubled, and with bit 3 set. */
#define MAKE_PIXEL_16(r,g,b) ((r) << 12 | (g) << 7 | (b) << 1)

/* Clip data: clip[0] bounds plane A (enable/left/right are read by the
   render_ntx* functions), clip[1] bounds the window (read by render_ntw*).
   Cleared by render_reset(); filled in by window_clip(), defined elsewhere. */
static clip_t clip[2];

/* Attribute expansion table: indexed by bits 13-15 of a name table entry,
   yields that 3-bit value replicated into the high nibble of all four bytes
   so it can be OR-ed onto four pattern bytes at once */
static const uint32 atex_table[] = {
    0x00000000, 0x10101010, 0x20202020, 0x30303030,
    0x40404040, 0x50505050, 0x60606060, 0x70707070
};

/* Sprite name look-up table, filled by make_name_lut() */
uint8 name_lut[0x400];

/* Sprite line buffer data */
uint8 object_index_count;

/* Per-line sprite records (20 slots); populated outside this part of the file */
struct {
    uint16 ypos;
    uint16 xpos;
    uint16 attr;
    uint8 size;
    uint8 index;
}object_info[20];

/* Pixel look-up tables and table base address: lut_base is the raw
   malloc() block, lut[0] is the LUT_SIZE-aligned address inside it and
   lut[1..] follow at LUT_SIZE intervals (see xrender_init) */
uint8 *lut[5];
uint8 *lut_base;

/* 16-bit pixel remapping data: pixel_16 is handed to remap_16() by
   render_line(); pixel_16_lut holds the three colour ramps built in
   xrender_init() */
uint16 pixel_16[0x100];
uint16 pixel_16_lut[3][0x200];

/* Line buffers (render_line() treats offset 0x20 as the first visible pixel) */
uint8 tmp_buf[0x400];                   /* Temporary buffer */
uint8 bg_buf[0x400];                    /* Merged background buffer */
uint8 nta_buf[0x400];                   /* Plane A / Window line buffer */
uint8 ntb_buf[0x400];                   /* Plane B line buffer */
uint8 obj_buf[0x400];                   /* Object layer line buffer */

/* SWRENDER END */

/*--------------------------------------------------------------------------*/
/* Init, reset, shutdown routines                                           */
/*--------------------------------------------------------------------------*/
/*
 * Set up both renderers.
 *
 * Hardware side: selects the ARGB1555 palette format, allocates the
 * main-RAM staging buffer and TILE_BUFFERS PVR textures for the tile cache,
 * and precompiles the border polygon header.
 * Software side: allocates and fills the pixel look-up tables, builds the
 * 16-bit colour ramps, installs the colour update callback and builds the
 * sprite name table.
 *
 * xvdp  non-zero selects xcolor_update_16 as the colour update function,
 *       zero selects color_update_16.
 *
 * Returns 1 on success. Returns 0 if any allocation fails; in that case
 * everything allocated by this call has been released again.
 */
int xrender_init(int xvdp)
{
	pvr_poly_cxt_t cxt;
	int bx, ax, i;
	int n_tex = 0;	/* Number of PVR textures successfully allocated */

	/* Put the palettes in ARGB1555 mode */
	pvr_set_pal_format(PVR_PAL_ARGB1555);

	/* Allocate the tile cache: staging buffer first, then the textures */
	tile_tex_buf = memalign(32, 0x10000);
	if(!tile_tex_buf)
		goto fail;

	for(n_tex = 0; n_tex < TILE_BUFFERS; n_tex++)
	{
		tile_tex_array[n_tex] = pvr_mem_malloc(0x10000);
		if(!tile_tex_array[n_tex])
			goto fail;

		/* Pre-shift the address into the form used in the header's mode3 word */
		tile_tex_ptr_array[n_tex] = ((uint32)tile_tex_array[n_tex] & 0x00fffff8) >> 3;
	}
	tile_tex_cur = 0;
	tile_tex = tile_tex_array[tile_tex_cur];
	tile_tex_ptr = tile_tex_ptr_array[tile_tex_cur];

	/* Precompile the border header */
	pvr_poly_cxt_col(&cxt, PVR_LIST_OP_POLY);
	cxt.gen.shading = PVR_SHADE_FLAT;
	pvr_poly_compile(&hdr_border, &cxt);

	xpalette_base = 0;

	/* SWRENDER START */

	/* Allocate and align pixel look-up tables; one extra LUT_SIZE of slack
	   leaves room to round the base address up to a LUT_SIZE boundary */
	lut_base = (uint8 *)malloc((LUT_MAX * LUT_SIZE) + LUT_SIZE);
	if(!lut_base)
		goto fail;

	lut[0] = (uint8 *)(((uint32)lut_base + LUT_SIZE) & ~(LUT_SIZE - 1));
	for(i = 1; i < LUT_MAX; i += 1)
	{
		lut[i] = lut[0] + (i * LUT_SIZE);
	}

	/* Make pixel look-up table data */
	for(bx = 0; bx < 0x100; bx += 1)
	for(ax = 0; ax < 0x100; ax += 1)
	{
		uint16 index = (bx << 8) | (ax);
		lut[0][index] = make_lut_bg(bx, ax);
		lut[1][index] = make_lut_obj(bx, ax);
		lut[2][index] = make_lut_bg_ste(bx, ax);
		lut[3][index] = make_lut_obj_ste(bx, ax);
		lut[4][index] = make_lut_bgobj_ste(bx, ax);
	}

	/* Make pixel data tables: plain, doubled and bit-3-set colour ramps */
	for(i = 0; i < 0x200; i += 1)
	{
		int r, g, b;

		r = (i >> 6) & 7;
		g = (i >> 3) & 7;
		b = (i >> 0) & 7;

		pixel_16_lut[0][i] = MAKE_PIXEL_16(r,g,b);
		pixel_16_lut[1][i] = MAKE_PIXEL_16(r<<1,g<<1,b<<1);
		pixel_16_lut[2][i] = MAKE_PIXEL_16(r|8,g|8,b|8);
	}

	/* Set up color update function */
	if (xvdp)
		color_update = xcolor_update_16;
	else
		color_update = color_update_16;

	/* Make sprite name look-up table */
	make_name_lut();

	/* SWRENDER END */

	return 1;

fail:
	/* Undo the allocations made so far so a failed init leaks nothing */
	while(n_tex-- > 0)
	{
		pvr_mem_free(tile_tex_array[n_tex]);
		tile_tex_array[n_tex] = 0;
		tile_tex_ptr_array[n_tex] = 0;
	}
	tile_tex = 0;
	tile_tex_ptr = 0;

	free(tile_tex_buf);
	tile_tex_buf = NULL;

	return 0;
}

void make_name_lut(void)
{
    int col, row;
    int vcol, vrow;
    int width, height;
    int flipx, flipy;
    int i, name;

#ifdef SQs
	sq_clr(name_lut, sizeof(name_lut));//Quzar
#else
    memset(name_lut, 0, sizeof(name_lut));
#endif

    for(i = 0; i < 0x400; i += 1)
    {
        vcol = col = i & 3;
        vrow = row = (i >> 2) & 3;
        height = (i >> 4) & 3;
        width = (i >> 6) & 3;
        flipx = (i >> 8) & 1;
        flipy = (i >> 9) & 1;

        if(flipx)
            vcol = (width - col);
        if(flipy)
            vrow = (height - row);

        name = vrow + (vcol * (height + 1));

        if((row > height) || col > width)
            name = -1;

        name_lut[i] = name;        
    }
}

/*
 * Zero the software renderer's working state: the clip data, all five line
 * buffers and the 16-bit pixel remap table. Uses sq_clr() when SQs is
 * defined and memset() otherwise.
 */
void render_reset(void)
{
  /* SWRENDER START */

#ifdef SQs
#define RENDER_RESET_CLEAR(obj) sq_clr((obj), sizeof(obj))	//Quzar
#else
#define RENDER_RESET_CLEAR(obj) memset((obj), 0, sizeof(obj))
#endif

  /* Every argument is a true array, so sizeof() covers the whole object */
  RENDER_RESET_CLEAR(clip);

  RENDER_RESET_CLEAR(bg_buf);
  RENDER_RESET_CLEAR(tmp_buf);
  RENDER_RESET_CLEAR(nta_buf);
  RENDER_RESET_CLEAR(ntb_buf);
  RENDER_RESET_CLEAR(obj_buf);

  RENDER_RESET_CLEAR(pixel_16);

#undef RENDER_RESET_CLEAR

  /* SWRENDER END */
}

void render_shutdown(void)
{
	int i;

	// Free the tile cache
	free(tile_tex_buf);
	tile_tex_buf = NULL;
	for(i = 0; i < TILE_BUFFERS; i++)
	{
		pvr_mem_free(tile_tex_array[i]);
		tile_tex_ptr_array[i] = 0;
		tile_tex_array[i] = 0;
	}

  /* SWRENDER START */

  if(lut_base) free(lut_base);

  /* SWRENDER END */
}


/*--------------------------------------------------------------------------*/
/* Palette handling garbage                                                 */
/*--------------------------------------------------------------------------*/
/*
 * Upload all 64 colours to the PVR palette.
 *
 * Each call first moves on to the next group of four PVR palettes
 * (xpalette_base steps by 4 and wraps from 16 back to 0), then converts
 * every CRAM entry and stores it at palette index entry + xpalette_base * 16.
 * Entry 0 of each 16-colour line is written without bit 15 set; every other
 * entry gets bit 15 set. Finally is_color_dirty is cleared.
 */
void xrender_deal_with_palette()
{
	int entry;
	int first_slot;

	/* Switch to next colour palette */
	xpalette_base += 4;
	if(xpalette_base == 16)
		xpalette_base = 0;
	first_slot = xpalette_base * 16;

	/* All entries are rewritten; per-entry dirty tracking is not used here */
	for(entry = 0; entry < 64; entry++)
	{
		/* CRAM holds each colour as two bytes, low byte first */
		const uint16 packed = ((cram[entry << 1]) | (cram[(entry << 1) + 1] << 8));

		/* Spread the three 3-bit fields into the top of three 5-bit fields */
		uint32 pvr_colour = ((packed & 0x007) << 2)
		                  | ((packed & 0x038) << 4)
		                  | ((packed & 0x1C0) << 6);

		/* Only the first colour of each line of 16 is left without bit 15 */
		if((entry & 0x0F) != 0)
			pvr_colour |= 0x8000;

		pvr_set_pal_entry(entry + first_slot, pvr_colour);
	}

	/* Colour palette clean - byebye! */
	is_color_dirty = 0;
}

/*
 * TWIDTAB(x) spreads the low three bits of x apart so that bit n moves to
 * bit 2n (0b111 -> 0b10101). TWIDOUT(x, y) interleaves two such values,
 * with y in the even bits and x in the odd bits.
 *
 * The argument is parenthesised at every use so that expressions such as
 * TWIDTAB(a | b) expand correctly. It is still evaluated three times, so
 * do not pass an argument with side effects.
 */
#define TWIDTAB(x) ( ((x)&1)|(((x)&2)<<1)|(((x)&4)<<2) )
#define TWIDOUT(x, y) ( TWIDTAB((y)) | (TWIDTAB((x)) << 1) )

/* FIXME - This is enough to drive a perfectly sane person crazy ;-) */
/* TODO - Store queues for output */
/* TODO - Prefetch */
/* TODO - Convert to assembly */
/* TODO - Simplify some more */
/* TODO - Eliminate redundant TODOs */
/*
 * Convert one 32-byte tile at `pixels` into 16 twiddled 16-bit words at
 * `vtex`.
 *
 * The tile is walked as four pairs of 4-byte rows. For each byte position
 * (with adjacent bytes swapped via col ^ 1) the byte from the upper row of
 * the pair and the byte four bytes further on are combined, nibble-swapped,
 * into one output word; the word's position in vtex is the bit-interleave
 * of the pair index (even bits) and the byte index (odd bits).
 */
void twiddle_4bit_md(uint8 *pixels, uint16 *vtex)
{
	int pair, col;

	for (pair = 0; pair < 4; pair++)
	{
		const uint8 *upper = pixels + (pair << 3);
		const uint8 *lower = upper + 4;
		const int out_row = TWIDTAB(pair);

		for (col = 0; col < 4; col++)
		{
			const int top = upper[col ^ 1];
			const int bottom = lower[col ^ 1];

			vtex[out_row | (TWIDTAB(col) << 1)] = ((top & 15) << 8)
			                                    | (top >> 4)
			                                    | ((bottom & 15) << 12)
			                                    | (bottom & 240);
		}
	}
}

/* Hand-written version with the same signature as twiddle_4bit_md(),
   defined outside this file; this is the one xrender_copy_patterns() calls.
   Quzar - The function I did saves maybe 10 cycles over the best gcc I could get */
void twiddle_4bit_md_asm(uint8 *pixels, uint16 *vtex);

void xrender_copy_patterns()
{
	int i;
	for(i = 0; i < bg_list_index; i++)
	{
		int tile = bg_name_list[i];

		bg_name_dirty[tile] = 0;
		tile <<= 5;
		twiddle_4bit_md_asm((uint8 *)vram+tile, (uint16 *)((uint8 *)tile_tex_buf+tile));
	}
	bg_list_index = 0;
}

/*
 * Precomputed words for the polygon header that xrender_draw_tile() builds
 * by hand: PT_CMD goes to hdr->cmd, PT_MODE1 to hdr->mode1, PT_MODE2 to
 * hdr->mode2, and PT_MODE3 is OR-ed with the palette selector and the tile
 * address to form hdr->mode3.
 *
 * NOTE(review): the field legend below uses the PVR_TA_PM2_* names, so it
 * appears to describe the bit layout of the mode2 word (PT_MODE2), not of
 * PT_MODE1 which it directly follows - confirm against the KOS headers.
 */
#define PT_CMD 0x84840008     // 10000100100001000000000000001000
#define PT_MODE1 0x92000000
                              // sss                              - PVR_TA_PM2_SRCBLEND_SHIFT
                              //    ddd                           - PVR_TA_PM2_DSTBLEND_SHIFT
                              //       s                          - PVR_TA_PM2_SRCENABLE_SHIFT
                              //        d                         - PVR_TA_PM2_DSTENABLE_SHIFT
                              //         ff                       - PVR_TA_PM2_FOG_SHIFT
                              //           c                      - PVR_TA_PM2_CLAMP_SHIFT
                              //            a                     - PVR_TA_PM2_ALPHA_SHIFT
                              //             a                    - PVR_TA_PM2_TXRALPHA_SHIFT
                              //              uv                  - PVR_TA_PM2_UVFLIP_SHIFT
                              //                uv                - PVR_TA_PM2_UVCLAMP_SHIFT
                              //                  fff             - PVR_TA_PM2_FILTER_SHIFT
                              //                     mmmm         - PVR_TA_PM2_MIPBIAS_SHIFT
                              //                         tt       - PVR_TA_PM2_TXRENV_SHIFT
                              //                           uuu    - PVR_TA_PM2_USIZE_SHIFT
                              //                              vvv - PVR_TA_PM2_VSIZE_SHIFT
#define PT_MODE2 0x949004C0   // 10010100100100000000010011000000
                              //                    1000000000000   (0x1000 - presumably the attribute V-flip bit, for reference)
                              //                     100000000000   (0x0800 - presumably the attribute H-flip bit, for reference)
#define PT_MODE3 0x28000000

/* Draws a single tile at the specified coordinates */
/* TODO - Speed this up, because it effectively IS the VDP emulator */
/*
 * Submits one polygon header and a four-vertex strip through the PVR
 * direct-rendering interface. The caller must already have opened a list
 * and called pvr_dr_init(dr_state).
 *
 * x, y    top-left corner of the quad in output pixels; the quad is
 *         DELTAXY wide and high
 * zbase   depth for the quad; DEPTH_PRIORITY is added when attr bit
 *         0x8000 is set
 * attr    name table style attribute word: bits 0-10 tile number,
 *         bit 11 horizontal flip, bit 12 vertical flip, bits 13-14 palette,
 *         bit 15 priority
 */
void xrender_draw_tile(float x, float y, float zbase, uint16 attr)
{
	pvr_poly_hdr_t *hdr;
	pvr_vertex_t *vert;
	/* (disabled) per-tile vertex colour: uint32 vertex_colour = 0xFFFFFFFF; */
	float tc_left, tc_right, tc_top, tc_bum;

  /* (disabled) dimming of low-depth tiles:
     if (zbase > 11.0) vertex_colour = 0xFFFFFFFF;
     else              vertex_colour = 0xCFFFFFFF; */

	/* (disabled) coordinate doubling - callers now pass doubled coordinates:
	   x *= 2;
	   y *= 2; */

	/* Construct polygon header
	   This is done manually to avoid calling pvr_poly_compile every frame
	   and so we don't have to cache 32768 of the sodding things */
	hdr = (pvr_poly_hdr_t *)pvr_dr_target(dr_state);
	hdr->cmd = PT_CMD;
	hdr->mode1 = PT_MODE1;

	/* H and V flipping */
	/* Naturally, the flags are in the opposite order to the way we want */
	/* Flipping through the header is disabled; it is done further down by
	   swapping the texture coordinates instead */
	hdr->mode2 = PT_MODE2;
	/* (disabled) hdr->mode2 |= ((uint32)(attr & 0x1000) << 5);  V */
	/* (disabled) hdr->mode2 |= ((uint32)(attr & 0x0800) << 7);  H */

	/* Colour palette */
	/* TODO - Eliminate some bit shifting (((attr >> 13) & 0x03) << 21) */
	/* Palette line from attr bits 13-14, offset by the palette group that
	   xrender_deal_with_palette() last uploaded */
	hdr->mode3 = PT_MODE3 | PVR_TXRFMT_4BPP_PAL(((attr >> 13) & 0x03) + xpalette_base);

	/* Tile base address */
	/* tile_tex_ptr is the texture address >> 3, so tile << 2 advances by
	   32 bytes per tile, matching the layout xrender_copy_patterns() writes */
	hdr->mode3 |= (tile_tex_ptr + ((attr & 0x7ff) << 2));

	/* Fill out junk data, and send to PVR */
	pvr_dr_commit(hdr);

	/* Check priority bit, change parameters around if necessary */
	if(attr & 0x8000)
	{
		zbase += DEPTH_PRIORITY;
		/* (disabled) vertex_colour = 0xFFFFFFFF; */
	}

	/* Check for horizontal and vertical flipping */
	/* A flipped tile simply has its texture coordinates swapped */
	if(attr & 0x1000)
  {
    tc_top = 1.0f;
    tc_bum = 0.0f;
  }
  else
  {
    tc_top = 0.0f;
		tc_bum = 1.0f;
	}

	if(attr & 0x0800)
	{
    tc_left = 1.0f;
    tc_right = 0.0f;
  }
	else
	{
    tc_left = 0.0f;
		tc_right = 1.0f;
  }

	/* Send the upper-left vertex */
	vert = pvr_dr_target(dr_state);
	vert->flags = PVR_CMD_VERTEX;
	vert->x = x;
	vert->y = y;
	vert->z = zbase;
	vert->u = tc_left;
	vert->v = tc_top;
	vert->argb = 0xFFFFFFFF;
	vert->oargb = 0;
	pvr_dr_commit(vert);

	/* Send the upper-right vertex */
	vert = pvr_dr_target(dr_state);
	vert->flags = PVR_CMD_VERTEX;
	vert->x = x+DELTAXY;
	vert->y = y;
	vert->z = zbase;
	vert->u = tc_right;
	vert->v = tc_top;
	vert->argb = 0xFFFFFFFF;
	vert->oargb = 0;
	pvr_dr_commit(vert);

	/* Send the lower-left vertex */
	/* Only y and v are written here. NOTE(review): this presumably relies on
	   pvr_dr_target() alternating between two buffers whose contents survive
	   a commit, so that this target still holds the upper-left vertex's
	   flags/x/z/u/argb - confirm against the KOS direct-rendering docs
	   before reordering or adding any submission in this function. */
	vert = pvr_dr_target(dr_state);
	vert->y = y+DELTAXY;
	vert->v = tc_bum;
	pvr_dr_commit(vert);

	/* Send the lower-right vertex */
	/* Same assumption: the remaining fields are expected to be left over
	   from the upper-right vertex; only the end-of-strip flag, y and v change */
	vert = pvr_dr_target(dr_state);
	vert->flags = PVR_CMD_VERTEX_EOL;
	vert->y = y+DELTAXY;
	vert->v = tc_bum;
	pvr_dr_commit(vert);

	/* Nothing more to see here ;-) */
}

/* Main sprite rendering loop */
void xrender_drawsprites()
{
	int count;
  int link = 0;
	int total = 64 + ((reg[12] & 1) << 4);
	uint8 *q; // wasp

	for(count = 0; count < total; count ++)
	{
		int width, height, ch;
		int x, y, dx, dy, cy;
		uint16 attr;

		/* Set up pointers to the internal SAT and VRAM */
		q = &sat[link];
		//p = &vram[satb + link]; //wasp

		/* Grab coordinates and attributes */
		y = *(uint16 *)&q[0] - 120;
		x = *(uint16 *)&q[6] - 128; //wasp

		attr = *(uint16 *)&q[4]; // wasp
		height = q[3] & 3;
		width = (q[3] & 0x0C) >> 2;

    ch=y+(height<<3)-1;
    cy=attr & 0x7ff;

    #if 0
    if (x==-128 && (y > 8 && y < 232))
    {
      printf("%d to %d == %d\n", y, ch, cy);

      for(;ch>=y;ch--)
        line_test[ch]=cy;
    }
    else
    {
      for(;ch>=y;ch--)
        if (line_test[ch] > cy)
        {
          printf("skipped %d because %d < %d\n", cy, ch, line_test[ch]);
          goto SKIP_SPRITE;
        }
    }
    #endif
    
    /* 32-cell mode */
    if(!(reg[0x0C] & 0x01))
      x += 32;

    /* Work out flipping stuff */
    if(attr & 0x0800)
    {
      /* Draw from right to left */
      x += width<<3;
      dx = -16;
    }
    else
      /* Draw from left to right */
      dx = 16;

    if(attr & 0x1000)
    {
      /* Draw from bottom to top */
      y += height<<3;
      dy = -16;
    }
    else
      /* Draw from top to bottom */
      dy = 16;

    //printf("%d,%d (%d,%d)\n",x,y,width,height);

    x *= 2;
    y *= 2;

    /* Draw tile */
    width++;
    height++;
    while(width--)
    {
      cy = y;
      ch = height;

      while(ch--)
      {
        xrender_draw_tile(x, cy, DEPTH_FAR_S - count, attr++);
        cy += dy;
      }
      x += dx;
    }

    #if 0
    SKIP_SPRITE:
    #endif
    
    // Follow the link field
    if (!(link = (q[2] & 0x7F)))
      break;

    link <<= 3;
	}
}

/*
 * Fetch the horizontal scroll values that apply to `line`.
 *
 * The low two bits of reg[11] choose which entry of the scroll table at
 * vram[hscb] is used: 0 = the first entry for every line, 1 = one of the
 * first eight entries (line & 7), 2 = the entry of the first line of each
 * group of eight, 3 = one entry per line. Each entry is four bytes:
 * plane A's value followed by plane B's.
 *
 * Both results are masked to 10 bits before being stored through
 * scrolla / scrollb.
 */
void get_hscroll(int line, uint16 *scrolla, uint16 *scrollb)
{
    const int mode = reg[11] & 3;
    int entry_ofs;

    if(mode == 1)           /* First 8 lines */
        entry_ofs = (line & 7) << 2;
    else if(mode == 2)      /* Every 8 lines */
        entry_ofs = (line & ~7) << 2;
    else if(mode == 3)      /* Every line */
        entry_ofs = line << 2;
    else                    /* Full-screen */
        entry_ofs = 0;

    *scrolla = *(uint16 *)&vram[hscb + entry_ofs + 0] & 0x03FF;
    *scrollb = *(uint16 *)&vram[hscb + entry_ofs + 2] & 0x03FF;
}

/* FIXME - this ignores almost all video modes, windowing, half the scrolling
   parameters, and god knows what else... */
/*
 * Draw one scrolling plane as a grid of hardware tiles.
 *
 * which   0 draws plane A (name table ntab, depth DEPTH_FAR_A),
 *         non-zero draws plane B (name table ntbb, depth DEPTH_FAR_B)
 *
 * Vertical scroll comes from the first 32-bit VSRAM word (upper half for
 * plane B, lower half for plane A); horizontal scroll is fetched once per
 * row of tiles via get_hscroll().
 *
 * NOTE(review): the row index is masked with playfield_row_mask and the
 * name table address is not masked at all, whereas render_ntx() applies
 * playfield_row_mask to a pixel position and y_mask to the address -
 * vertical wrapping here may differ from the software renderer.
 */
void xrender_ntx(int which)
{
	/* Name table base address and depth for the chosen plane */
	const uint16 table = (which) ? ntbb : ntab;
	const float depth = (which) ? DEPTH_FAR_B : DEPTH_FAR_A;

	/* Vertical scrolling parameters */
	const uint32 *vs = (uint32 *)&vsram[0];
	const uint16 y_scroll = (vs[0] >> ((which) ? 16 : 0));

	int cell_y = ((y_scroll & 0x3F8) >> 3) & playfield_row_mask;
	int pixel_y;

	/* 40- or 32-cell mode? The narrow mode is centred with a 32 pixel margin */
	int ofs_x = 32;
	int end_x = 288;
	if(reg[0x0C] & 1)
	{
		ofs_x = 0;
		end_x = 320;
	}

	/* Start above the first line by the fine scroll, one tile row per pass */
	for(pixel_y = -(y_scroll & 0x07); pixel_y < 224; pixel_y += 8)
	{
		uint16 xascroll, xbscroll;
		uint16 x_scroll;
		uint16 *ptr;
		int cell_x, pixel_x;

		/* Horizontal scrolling parameters (rows above line 0 use line 0's) */
		get_hscroll((pixel_y < 0) ? 0 : pixel_y, &xascroll, &xbscroll);
		x_scroll = (which) ? -xbscroll : -xascroll;

		/* First name table column, and where it lands on screen */
		cell_x = ((x_scroll & 0x3F8) >> 3) & playfield_col_mask;
		ptr = (uint16 *)&vram[table + (cell_y << playfield_shift)];

		for(pixel_x = -(x_scroll & 0x07) + ofs_x; pixel_x < end_x; pixel_x += 8)
		{
			/* Output is doubled, and shifted down by the 8 pixel top border */
			xrender_draw_tile(pixel_x*2, (pixel_y + 8)*2, depth, ptr[cell_x]);
			cell_x = (cell_x + 1) & playfield_col_mask;
		}

		cell_y = (cell_y + 1) & playfield_row_mask;
	}
}

/* FIXME: This overlaps plane A, but doesn't overwrite it... */
/*
 * Draw the window plane as hardware tiles, 28 rows of cells.
 *
 * reg[17] gives the horizontal split (bits 0-4 position, bit 7 side) and
 * reg[18] the vertical one. On rows selected by the vertical setting the
 * window spans the whole line; on the others it covers either the columns
 * left of the split or, when bit 7 of reg[17] is set, the columns from the
 * split to the right edge.
 *
 * All horizontal quantities (hp, sw, the column count) are in units of two
 * cells, i.e. 16 pixels and two name table entries, which is why two tiles
 * are drawn per column.
 */
void xrender_ntw()
{
	uint16 *attr = (uint16 *)&vram[ntwb];

	/* Window positions and flipping */
	int hp = (reg[17] & 0x1F);
	int hf = (reg[17] >> 7) & 1;
	int vp = (reg[18] & 0x1F);
	int vf = (reg[18] >> 7) & 1;

	/* Display size: width in 2-cell columns, name table pitch in entries */
	int sw = 16 + ((reg[12] & 1) << 2);
	int pitch = 32 + ((reg[12] & 1) << 5);

	/* 32-cell mode is centred: the picture starts 32 pixels in */
	float margin_x = (sw == 16) ? 32.0f : 0.0f;

	/* Pixel y position of the current row (first row sits below the border) */
	float pixel_y = 8.0f;
	int rows;

	/* The split can be programmed beyond the right edge; without this clamp
	   sw - hp would go negative and the column loop would never terminate */
	if(hp > sw)
		hp = sw;

	for(rows = 0; rows < 28; rows++)
	{
		int first_col, n_cols;
		float pixel_x;
		uint16 *battr;

		if(vf == (rows >= vp))
		{
			/* Window takes up entire line */
			first_col = 0;
			n_cols = sw;
		}
		else if(hf)
		{
			/* Window covers the right-hand part of the line */
			first_col = hp;
			n_cols = sw - hp;
		}
		else
		{
			/* Window covers the left-hand part of the line */
			first_col = 0;
			n_cols = hp;
		}

		/* A column is two cells: 16 pixels wide, two name table entries */
		pixel_x = margin_x + 16.0f * first_col;
		battr = attr + (first_col << 1);

		while(n_cols-- > 0)
		{
			xrender_draw_tile(pixel_x*2, pixel_y*2, DEPTH_FAR_W, *battr++);
			pixel_x += 8.0f;
			xrender_draw_tile(pixel_x*2, pixel_y*2, DEPTH_FAR_W, *battr++);
			pixel_x += 8.0f;
		}

		/* Next row */
		pixel_y += 8.0f;
		attr += pitch;
	}
}

/*--------------------------------------------------------------------------*/
/* Main frame rendering function                                            */
/*--------------------------------------------------------------------------*/
/*
 * Submit an axis-aligned rectangle to the currently open list using the
 * precompiled border header. The corners are sent as a four-vertex strip
 * (top-left, top-right, bottom-left, bottom-right) at depth 250 with both
 * colour words zero. The texture coordinate fields are never written.
 */
static void draw_rect(float x1, float y1, float x2, float y2)
{
	const float corner_x[4] = { x1, x2, x1, x2 };
	const float corner_y[4] = { y1, y1, y2, y2 };
	pvr_vertex_t vert;
	int n;

	pvr_prim(&hdr_border, sizeof(hdr_border));

	/* Fields shared by all four corners */
	vert.z = 250;
	vert.argb = 0;
	vert.oargb = 0;

	for(n = 0; n < 4; n++)
	{
		/* The last vertex closes the strip */
		vert.flags = (n == 3) ? PVR_CMD_VERTEX_EOL : PVR_CMD_VERTEX;
		vert.x = corner_x[n];
		vert.y = corner_y[n];
		pvr_prim(&vert, sizeof(vert));
	}
}

/*
 * Opaque list: draws the borders that mask the edges of the picture.
 * A 16 pixel strip is always drawn along the top and bottom of the 640x480
 * output; when bit 0 of reg[0x0C] is clear (32-cell mode) a 64 pixel strip
 * is added down each side as well.
 */
void xrender_drawframe_op()
{
	const int narrow_mode = !(reg[0x0C] & 0x01);

	pvr_list_begin(PVR_LIST_OP_POLY);

	/* Top and bottom */
	draw_rect(0, 0, 640, 16);
	draw_rect(0, 464, 640, 480);

	/* Left and right */
	if(narrow_mode)
	{
		draw_rect(0, 0, 64, 480);
		draw_rect(576, 0, 640, 480);
	}

	pvr_list_finish();
}

/*
 * Render one frame through the PVR.
 *
 * In order: refresh the palette and the twiddled tiles if they changed, set
 * the background colour from the entry selected by reg[0x07], submit the
 * punch-through list (sprites, plane A, plane B, window - only when the
 * display enable bit 0x40 of reg[0x01] is set), upload the staging buffer
 * to the current tile texture, and finally switch to the next tile texture.
 */
void xrender_drawframe()
{
	/* Deal with changes to the colour palette */
	if(is_color_dirty)
		xrender_deal_with_palette();

  #if 0
	#ifdef SQs
		sq_clr(line_test, sizeof(line_test));	//Quzar
	#else
		memset(line_test, 0, sizeof(line_test));	//
	#endif //SQs  
  #endif

	/* Copy pattern data */
	if(bg_list_index)
		xrender_copy_patterns();

	/* Change the background colour */
	{
		const int entry = reg[0x07] & 0x3F;
		const uint16 cval = ((cram[entry << 1]) | (cram[(entry << 1) + 1] << 8));

		/* Move each 3-bit field to the top of a byte... */
		uint8 r = ((cval & 0x1C0) >> 1);
		uint8 g = ((cval & 0x038) << 2);
		uint8 b = ((cval & 0x007) << 5);

		/* ...and copy it down four bits to fill in lower bits */
		r |= (r >> 4);
		g |= (g >> 4);
		b |= (b >> 4);

		pvr_set_bg_color(r/256.0f, g/256.0f, b/256.0f);
	}

	/* Punch-through display list - almost everything goes here */
	pvr_list_begin(PVR_LIST_PT_POLY);

	/* Set up the direct rendering state */
	pvr_dr_init(dr_state);

	/* Only bother if the display is enabled */
	if(reg[0x01] & 0x40)
	{
		/* Draw sprites, plane A, plane B, window */
		xrender_drawsprites();
		xrender_ntx(0);
		xrender_ntx(1);
		xrender_ntw();
	}

	/* All done for the PT display list */
	pvr_list_finish();

	/* Copy tile data to VRAM */
	/* TODO - Get DMA working without corruption */
	//Quzar- if you just make the buffer/texture larger by 256bits then you could ignore the corruption and chop off the last bit of the buffer
	/* The first break_size bytes go by non-blocking DMA, the remainder by a
	   store queue copy */
#define break_size 0x6000		//Quzar - Well the idea is to split the amount and do some of it via dma and the rest with
	dcache_flush_range(tile_tex_buf, break_size);
	pvr_txr_load_dma(tile_tex_buf, tile_tex, break_size, 0, NULL, 0);
	// TEMP - Store queue copy
	sq_cpy(tile_tex+break_size, tile_tex_buf+break_size, 0x10000-break_size);
	//Quzar - Blocking DMA of the whole 0x10000 bytes works fine as an alternative

	/* Change tile cache buffers */
	tile_tex_cur = (tile_tex_cur + 1 == TILE_BUFFERS) ? 0 : tile_tex_cur + 1;
	tile_tex = tile_tex_array[tile_tex_cur];
	tile_tex_ptr = tile_tex_ptr_array[tile_tex_cur];
}

/* End-of-frame hook; intentionally empty, kept so callers have a fixed
   entry point */
void xrender_endframe()
{
}

/* SWRENDER_START */

/*--------------------------------------------------------------------------*/
/* Line render function                                                     */
/*--------------------------------------------------------------------------*/

/*
 * Software-render one scanline into bitmap.data.
 *
 * Nothing is drawn unless bit 0 of frames_skipped is set. With the display
 * disabled (reg[1] bit 0x40 clear) the line is filled with the border
 * colour. Otherwise plane A, plane B and the window are rendered into
 * nta_buf / ntb_buf, merged, combined with the sprite layer (using the
 * shadow/highlight tables when reg[12] bit 8 is set) and the result is
 * converted to 16-bit pixels with remap_16().
 *
 * In interlace mode plane A is rendered into nta_buf like in the other two
 * modes: the window pass and merge() both read nta_buf, so rendering it
 * into the temporary buffer (as this function used to) discarded plane A.
 */
void render_line(int line)
{
  uint8 *lb = tmp_buf;
  int width = (reg[12] & 1) ? 320 : 256;

  if (!(frames_skipped & 1))
    return;

  if((reg[1] & 0x40) == 0x00)
  {
    /* Use the overscan color to clear the screen */
#ifdef SQs
	sq_set(&lb[0x20], 0x40 | border, bitmap.viewport.w); //Quzar - This should be faster
#else
    memset(&lb[0x20], 0x40 | border, bitmap.viewport.w);	
#endif
  }
  else
  {
    uint16 xascroll;
    uint16 xbscroll;

    /* Default to the non-interlaced sprite and window renderers */
    void (*render_obj_func)(int line, uint8 *buf, uint8 *table) = render_obj;
    void (*render_ntw_func)(int line, uint8 *buf) = render_ntw;

    update_bg_pattern_cache();
    window_clip(line);
    get_hscroll(line, &xascroll, &xbscroll);

    if(im2_flag)
    {
      /* Interlace mode */
      render_ntx_im2(ntab, line, nta_buf, xascroll);
      render_ntx_im2(ntbb, line, ntb_buf, xbscroll);

      render_ntw_func = render_ntw_im2;
      render_obj_func = render_obj_im2;
    }
    else if(reg[0x0B] & 4)
    {
      /* Per-column vertical scrolling */
      render_ntx_vs(ntab, line, nta_buf, xascroll);
      render_ntx_vs(ntbb, line, ntb_buf, xbscroll);
    }
    else
    {
      render_ntx(ntab, line, nta_buf, xascroll);
      render_ntx(ntbb, line, ntb_buf, xbscroll);
    }

    /* The window shares plane A's buffer */
    render_ntw_func(line, nta_buf);

    if(reg[12] & 8)
    {
      /* Backgrounds and sprites are built separately, then combined */
      merge(&nta_buf[0x20], &ntb_buf[0x20], &bg_buf[0x20], lut[2], width);

#ifdef SQs1
	  sq_clr(&obj_buf[0x20], width);	//Quzar - SQs	
#else
      memset(&obj_buf[0x20], 0, width);
#endif

      render_obj_func(line, obj_buf, lut[3]);

      merge(&obj_buf[0x20], &bg_buf[0x20], &lb[0x20], lut[4], width);
    }
    else
    {
      /* Sprites are drawn straight on top of the merged backgrounds */
      merge(&nta_buf[0x20], &ntb_buf[0x20], &lb[0x20], lut[0], width);

      render_obj_func(line, lb, lut[1]);
    }
  }

  /* Blank the leftmost 8 pixels when reg[0] bit 0x20 is set */
  if(reg[0] & 0x20)
  {
#ifdef SQs
    sq_set(&lb[0x20], 0x40 | border, 0x08); //Quzar- whenever possible use sq as opposed to standard mem*
#else
	memset(&lb[0x20], 0x40 | border, 0x08);	
#endif
  }

  remap_16(lb+0x20, (uint16 *)&bitmap.data[(line << 10) + 0x40], pixel_16, width);
}

/*--------------------------------------------------------------------------*/
/* Window rendering                                                         */
/*--------------------------------------------------------------------------*/

/*
 * Render the window plane for one scanline into buf, covering the 2-cell
 * columns clip[1].left up to (not including) clip[1].right. Column n lands
 * at buf[0x20 + n * 16].
 */
void render_ntw(int line, uint8 *buf)
{
  /* Name table rows are 64 or 128 bytes apart depending on reg[12] bit 0 */
  const int row_shift = 6 + (reg[12] & 1);
  /* Offset of this scanline's row inside a cached pattern */
  const int pattern_line = (line & 7) << 3;
  uint32 *names = (uint32 *)&vram[ntwb | ((line >> 3) << row_shift)];
  int col;

  /* Working variables used by name inside DRAW_COLUMN */
  uint32 *src, *dst, atex, atbuf, inc;

  dst = (uint32 *)&buf[0x20 + (clip[1].left << 4)];

  for(col = clip[1].left; col < clip[1].right; col++)
  {
    /* One 32-bit read fetches two name table entries */
    atbuf = names[col];
    DRAW_COLUMN(atbuf, pattern_line)
  }
}

/*
 * Interlace-mode version of render_ntw(): the pattern row is chosen from
 * twice the line-within-cell plus bit 4 of `status`, and columns are drawn
 * with DRAW_COLUMN_IM2.
 */
void render_ntw_im2(int line, uint8 *buf)
{
  /* Name table rows are 64 or 128 bytes apart depending on reg[12] bit 0 */
  const int row_shift = 6 + (reg[12] & 1);
  /* Offset of this scanline's row inside a cached double-height pattern */
  const int pattern_line = ((line & 7) << 1 | ((status >> 4) & 1)) << 3;
  uint32 *names = (uint32 *)&vram[ntwb | ((line >> 3) << row_shift)];
  int col;

  /* Working variables used by name inside DRAW_COLUMN_IM2 */
  uint32 *src, *dst, atex, atbuf, offs, inc;

  dst = (uint32 *)&buf[0x20 + (clip[1].left << 4)];

  for(col = clip[1].left; col < clip[1].right; col++)
  {
    /* One 32-bit read fetches two name table entries */
    atbuf = names[col];
    DRAW_COLUMN_IM2(atbuf, pattern_line)
  }
}

/*--------------------------------------------------------------------------*/
/* Background plane rendering                                               */
/*--------------------------------------------------------------------------*/

/*
 * Render one scanline of a scrolling plane into buf.
 *
 * table    name table base; when it equals ntbb the plane is treated as
 *          plane B (full width, upper half of the VSRAM word), otherwise as
 *          plane A (limited to clip[0], lower half of the VSRAM word, and
 *          skipped entirely when clip[0].enable is zero)
 * line     scanline number
 * buf      line buffer; the first visible pixel is at offset 0x20
 * xscroll  horizontal scroll value for this line
 *
 * Columns are two cells (16 bytes) wide. When the scroll is not a multiple
 * of 16, one extra column is drawn partly to the left of the visible area.
 */
void render_ntx(uint16 table, int line, uint8 *buf, int xscroll)
{
  const int col_pair_mask = (playfield_col_mask >> 1);
  const int fine_x = (xscroll & 0x0F);
  int name_idx = ((playfield_col_mask + 1) >> 1) - ((xscroll >> 4) & col_pair_mask);
  int first_col, last_col, col;
  int vs_half;
  int ypos, pattern_line;
  uint32 *vs;
  uint32 *names;

  /* Working variables used by name inside DRAW_COLUMN */
  uint32 atex, atbuf, *src, *dst, inc;

  if(table != ntbb)
  {
    // Looks correct if clip[0].left has 1 subtracted
    // Otherwise window has gap between endpoint and where the first normal
    // nta column starts

    if(clip[0].enable == 0) return;
    first_col = clip[0].left;
    last_col = clip[0].right;
    name_idx = (name_idx + clip[0].left) & col_pair_mask;
    vs_half = 0;
  }
  else
  {
    first_col = 0;
    last_col = 16 + ((reg[0x0C] & 1) << 2);
    vs_half = 16;
  }

  /* Work out which name table row and pattern row this scanline shows */
  vs = (uint32 *)&vsram[0];
  ypos = (vs[0] >> vs_half) & 0xFFFF;
  ypos = (line + (ypos & 0x3FF)) & playfield_row_mask;
  pattern_line = (ypos & 7) << 3;
  names = (uint32 *)&vram[table + (((ypos >> 3) << playfield_shift) & y_mask)];

  if(fine_x)
  {
    /* Partial column that scrolls in from the left edge */
    dst = (uint32 *)&buf[0x20-(0x10-fine_x)];
    atbuf = names[(name_idx-1) & col_pair_mask];
    DRAW_COLUMN(atbuf, pattern_line)
  }

  dst = (uint32 *)(buf + 0x20 + fine_x + (first_col << 4));

  for(col = first_col; col < last_col; col++, name_idx++)
  {
    atbuf = names[name_idx & col_pair_mask];
    DRAW_COLUMN(atbuf, pattern_line)
  }
}


/*
 * Interlace-mode version of render_ntx().
 *
 * Differences from render_ntx(): the vertical scroll value is halved before
 * use, the pattern row is twice the line-within-cell plus bit 4 of
 * `status`, and columns are drawn with DRAW_COLUMN_IM2.
 * Parameters are as for render_ntx().
 */
void render_ntx_im2(uint16 table, int line, uint8 *buf, int xscroll)
{
  const int col_pair_mask = (playfield_col_mask >> 1);
  const int fine_x = (xscroll & 0x0F);
  int name_idx = ((playfield_col_mask + 1) >> 1) - ((xscroll >> 4) & col_pair_mask);
  int first_col, last_col, col;
  int vs_half;
  int ypos, pattern_line;
  uint32 *vs;
  uint32 *names;

  /* Working variables used by name inside DRAW_COLUMN_IM2 */
  uint32 atex, atbuf, *src, *dst, inc;
  uint32 offs;

  if(table != ntbb)
  {
    /* Plane A: confined to the area the window leaves free */
    if(clip[0].enable == 0) return;
    first_col = clip[0].left;
    last_col = clip[0].right;
    name_idx = (name_idx + clip[0].left) & col_pair_mask;
    vs_half = 0;
  }
  else
  {
    /* Plane B: full width */
    first_col = 0;
    last_col = 16 + ((reg[0x0C] & 1) << 2);
    vs_half = 16;
  }

  /* Work out which name table row and pattern row this scanline shows */
  vs = (uint32 *)&vsram[0];
  ypos = (vs[0] >> vs_half) & 0xFFFF;
  ypos = (line + ((ypos >> 1) & 0x3FF)) & playfield_row_mask;
  pattern_line = (((ypos & 7) << 1) | ((status >> 4) & 1)) << 3;
  names = (uint32 *)&vram[table + (((ypos >> 3) << playfield_shift) & y_mask)];

  if(fine_x)
  {
    /* Partial column that scrolls in from the left edge */
    dst = (uint32 *)&buf[0x20-(0x10-fine_x)];
    atbuf = names[(name_idx-1) & col_pair_mask];
    DRAW_COLUMN_IM2(atbuf, pattern_line)
  }

  dst = (uint32 *)(buf + 0x20 + fine_x + (first_col << 4));

  for(col = first_col; col < last_col; col++, name_idx++)
  {
    atbuf = names[name_idx & col_pair_mask];
    DRAW_COLUMN_IM2(atbuf, pattern_line)
  }
}


/*
  Render one scanline of a scrolling name table into buf, looking up a
  separate vertical scroll value for every column.

  table   - VRAM offset of the name table to draw. When it equals ntbb the
            line starts at column 0; otherwise clip[0] supplies the start.
  line    - scanline being rendered.
  buf     - line buffer; drawing starts 0x20 bytes in, 16 bytes per column.
  xscroll - horizontal scroll value for this line.

  NOTE(review): DRAW_COLUMN is defined outside this section; it presumably
  reads/writes the locals atex, src, dst and inc by name and advances dst,
  so those declarations must keep their names.
*/
void render_ntx_vs(uint16 table, int line, uint8 *buf, int xscroll)
{
  int column;
  int start, end;
  int index;
  int shift;
  /* Name table rows are addressed as 32-bit words (via nt below), hence half the column mask */
  int nametable_row_mask = (playfield_col_mask >> 1);
  int v_line;
  uint32 atex, atbuf, *src, *dst, inc;
  int y_scroll;
  uint32 *nt;
  int vsr_shift = 0;
  uint32 *vs;

  /* Fine scroll within a 16-byte column, and the starting word index in the row */
  shift = (xscroll & 0x0F);
  index = ((playfield_col_mask + 1) >> 1) - ((xscroll >> 4) & nametable_row_mask);

  if (table == ntbb)
  {
    start = 0;
    end = 16 + ((reg[0x0C] & 1) << 2);
    /* Use the upper 16 bits of each 32-bit VSRAM word for this table */
    vsr_shift = 16;
  }
  else
  {
    /* Any other table starts at the clip[0] span; nothing to draw if it is disabled */
    if(clip[0].enable == 0) return;
    start = clip[0].left;
    end = clip[0].right;
    index = (index + clip[0].left) & nametable_row_mask;
  }

  vs = (uint32 *)&vsram[0];
  /* NOTE(review): end is overwritten unconditionally here, so the clip[0].right
     value assigned above is discarded and the loop always runs to the full
     16/20 columns - confirm this is intended. */
  end = (reg[0x0C] & 1) ? 20 : 16;

  if(shift)
  {
    /* Partial column on the left edge: drawn from the previous name table
       word, with no vertical scroll value applied (only line is used). */
    dst = (uint32 *)&buf[0x20-(0x10-shift)];
    y_scroll = (line & playfield_row_mask);
    v_line = (y_scroll & 7) << 3;
    nt = (uint32 *)&vram[table + (((y_scroll >> 3) << playfield_shift) & y_mask)];
    atbuf = nt[(index-1) & nametable_row_mask];
    DRAW_COLUMN(atbuf, v_line)
  }

  /* Skip the 0x20-byte margin plus the fine scroll, then seek to the first column */
  buf = (buf + 0x20 + shift);
  dst = (uint32 *)&buf[start << 4];

  for(column = start; column < end; column += 1, index += 1)
  {
    /* Per-column vertical scroll: one 32-bit VSRAM word per column */
    y_scroll = (vs[column] >> vsr_shift) & 0xFFFF;
    y_scroll = (line + (y_scroll & 0x3FF)) & playfield_row_mask;
    /* Byte offset of the pattern row (8 bytes per row) */
    v_line = (y_scroll & 7) << 3;
    /* Name table row changes per column because the scroll does */
    nt = (uint32 *)&vram[table + (((y_scroll >> 3) << playfield_shift) & y_mask)];
    atbuf = nt[index & nametable_row_mask];
    DRAW_COLUMN(atbuf, v_line)
  }
}

/*--------------------------------------------------------------------------*/
/* Helper functions (cache update, hscroll, window clip)                    */
/*--------------------------------------------------------------------------*/

void update_bg_pattern_cache(void)
{
    int i;
    uint8 x, y, c;
    uint16 name;

    if(!bg_list_index) return;

    for(i = 0; i < bg_list_index; i += 1)
    {
        name = bg_name_list[i];
        bg_name_list[i] = 0;

        for(y = 0; y < 8; y += 1)
        {
            if(bg_name_dirty[name] & (1 << y))
            {
                uint8 *dst = &bg_pattern_cache[name << 6];
                uint32 bp = *(uint32 *)&vram[(name << 5) | (y << 2)];

                for(x = 0; x < 8; x += 1)
                {
                    c = (bp >> ((x ^ 3) << 2)) & 0x0F;
                    dst[0x00000 | (y << 3) | (x)] = (c);
                    dst[0x20000 | (y << 3) | (x ^ 7)] = (c);
                    dst[0x40000 | ((y ^ 7) << 3) | (x)] = (c);
                    dst[0x60000 | ((y ^ 7) << 3) | (x ^ 7)] = (c);
                }
            }
        }
        bg_name_dirty[name] = 0;
    }
    bg_list_index = 0;
}

/*
  Work out, for scanline `line`, which columns belong to clip[0] and which
  to clip[1], from registers 17 (horizontal position) and 18 (vertical
  position). clip[0] is the span the plane renderers in this file use for
  the non-ntbb table; clip[1] is the window span.

  Either one entry covers the whole line and the other is disabled and
  zeroed, or the line is split at the horizontal position with both enabled.
*/
void window_clip(int line)
{
  /* Display width in columns: 16 or 20 depending on bit 0 of reg[12] */
  const int columns = 16 + ((reg[12] & 1) << 2);

  const int v_edge = (reg[18] & 0x1F) << 3;
  const int v_down = (reg[18] >> 7) & 1;

  int owner;   /* clip[] entry that spans the entire line */
  int other;

  if(v_down == (line >= v_edge))
  {
    /* Line lies inside the vertical window range: window gets everything */
    owner = 1;
  }
  else
  {
    const int h_edge = (reg[17] & 0x1F);
    const int right  = (reg[17] >> 7) & 1;
    const int left   = right ^ 1;

    if(h_edge > columns)
    {
      /* Split point is beyond the display */
      owner = left;
    }
    else if(h_edge == 0)
    {
      /* Split point is at the left edge */
      owner = right;
    }
    else
    {
      /* Split inside the display: one entry on each side, both active */
      clip[left].left   = 0;
      clip[left].right  = h_edge;
      clip[right].left  = h_edge;
      clip[right].right = columns;
      clip[0].enable = 1;
      clip[1].enable = 1;
      return;
    }
  }

  other = owner ^ 1;

  clip[other].enable = 0;
  clip[other].left   = 0;
  clip[other].right  = 0;

  clip[owner].left   = 0;
  clip[owner].right  = columns;
  clip[owner].enable = 1;
}



/*--------------------------------------------------------------------------*/
/* Look-up table functions                                                  */
/*--------------------------------------------------------------------------*/

/* Input (bx):  d5-d0=color, d6=priority, d7=unused */
/* Input (ax):  d5-d0=color, d6=priority, d7=unused */
/* Output:      d5-d0=color, d6=priority, d7=unused */
/* Pick the visible pixel of two background layers: bx on the bottom, ax on */
/* top. ax wins when it is opaque, unless only bx has its priority bit set, */
/* in which case an opaque bx wins. Returns 0 when both are transparent.    */
int make_lut_bg(int bx, int ax)
{
    const int b_pixel  = (bx & 0x7F);
    const int a_pixel  = (ax & 0x7F);
    const int b_opaque = (bx & 0x0F) != 0;
    const int a_opaque = (ax & 0x0F) != 0;
    const int b_prio   = (bx >> 6) & 1;
    const int a_prio   = (ax >> 6) & 1;
    int out;

    if(b_prio && !a_prio)
    {
        /* Only the lower layer has priority: it is looked at first */
        if(b_opaque)      out = b_pixel;
        else if(a_opaque) out = a_pixel;
        else              out = 0;
    }
    else
    {
        /* Equal priority, or upper layer has priority: upper layer first */
        if(a_opaque)      out = a_pixel;
        else if(b_opaque) out = b_pixel;
        else              out = 0;
    }

    /* Strip palette bits from transparent pixels */
    if((out & 0x0F) == 0x00)
    {
        out &= 0xC0;
    }

    return out;
}


/* Input (bx):  d5-d0=color, d6=priority, d7=sprite pixel marker */
/* Input (sx):  d5-d0=color, d6=priority, d7=unused */
/* Output:      d5-d0=color, d6=zero, d7=sprite pixel marker */
/* Combine a line-buffer pixel (bx) with a sprite pixel (sx). A transparent  */
/* sprite pixel leaves bx untouched. Otherwise the result is marked with d7; */
/* the existing pixel is kept if it is already a sprite pixel, or if it is   */
/* an opaque priority pixel and the sprite has no priority.                  */
int make_lut_obj(int bx, int sx)
{
    const int under     = (bx & 0x3F);
    const int sprite    = (sx & 0x3F);
    const int is_sprite = (bx >> 7) & 1;
    const int bg_prio   = (bx >> 6) & 1;
    const int spr_prio  = (sx >> 6) & 1;
    int out;

    /* Transparent sprite pixel: nothing changes */
    if((sx & 0x0F) == 0)
    {
        return bx;
    }

    if(is_sprite)
    {
        /* An earlier sprite already owns this pixel */
        out = under;
    }
    else if(!spr_prio && bg_prio && (bx & 0x0F))
    {
        /* Opaque high-priority background covers a low-priority sprite */
        out = under;
    }
    else
    {
        out = sprite;
    }

    /* Strip palette bits from transparent pixels */
    if((out & 0x0F) == 0x00)
    {
        out &= 0xC0;
    }

    return (out | 0x80);
}


/* Input (bx):  d5-d0=color, d6=priority, d7=unused */
/* Input (ax):  d5-d0=color, d6=priority, d7=unused */
/* Output:      d5-d0=color, d6=priority, d7=intensity select (half/normal) */
/* Same layer selection as make_lut_bg(), with d7 of the result set whenever */
/* either input pixel has its priority bit set.                              */
int make_lut_bg_ste(int bx, int ax)
{
    const int b_pixel  = (bx & 0x7F);
    const int a_pixel  = (ax & 0x7F);
    const int b_opaque = (bx & 0x0F) != 0;
    const int a_opaque = (ax & 0x0F) != 0;
    const int b_prio   = (bx >> 6) & 1;
    const int a_prio   = (ax >> 6) & 1;
    int out;

    if(b_prio && !a_prio)
    {
        /* Only the lower layer has priority: it is looked at first */
        if(b_opaque)      out = b_pixel;
        else if(a_opaque) out = a_pixel;
        else              out = 0;
    }
    else
    {
        /* Equal priority, or upper layer has priority: upper layer first */
        if(a_opaque)      out = a_pixel;
        else if(b_opaque) out = b_pixel;
        else              out = 0;
    }

    /* Intensity select: set when any of the two layers carries priority */
    if(a_prio | b_prio)
    {
        out |= 0x80;
    }

    /* Strip palette bits from transparent pixels */
    if((out & 0x0F) == 0x00)
    {
        out &= 0xC0;
    }

    return out;
}


/* Input (bx):  d5-d0=color, d6=priority, d7=sprite pixel marker */
/* Input (sx):  d5-d0=color, d6=priority, d7=unused */
/* Output:      d5-d0=color, d6=priority, d7=sprite pixel marker */
/* Combine a line-buffer pixel (bx) with a sprite pixel (sx), keeping the    */
/* priority bit. A transparent sprite pixel leaves bx untouched; otherwise   */
/* the first sprite pixel written wins and the result is marked with d7.     */
int make_lut_obj_ste(int bx, int sx)
{
    int out;

    /* Transparent sprite pixel: nothing changes */
    if((sx & 0x0F) == 0)
    {
        return bx;
    }

    if((bx >> 7) & 1)
    {
        /* Already a sprite pixel: keep what is there */
        out = (bx & 0x7F);
    }
    else
    {
        out = (sx & 0x7F);
    }

    /* Strip palette bits from transparent pixels */
    if((out & 0x0F) == 0x00)
    {
        out &= 0xC0;
    }

    return (out | 0x80);
}


/* Input (bx):  d5-d0=color, d6=priority, d7=intensity (half/normal) */
/* Input (sx):  d5-d0=color, d6=priority, d7=sprite marker */
/* Output:      d5-d0=color, d6=intensity (half/normal), d7=(double/invalid) */
/* Merge a background pixel (bx) with a sprite pixel (sx) when intensity     */
/* control is active.                                                        */
/*                                                                           */
/* The sprite pixel is used when it is opaque and is not hidden behind an    */
/* opaque priority background pixel (a priority sprite is never hidden).     */
/* Otherwise the background color is returned with its intensity bit moved   */
/* from d7 to d6.                                                            */
/*                                                                           */
/* Sprite colors 0x3E and 0x3F do not appear themselves: they return the     */
/* background color with altered intensity bits. Sprite colors 0x0E, 0x1E    */
/* and 0x2E always get d6 set.                                               */
int make_lut_bgobj_ste(int bx, int sx)
{
    const int bg_color     = (bx & 0x3F);
    const int bg_prio      = (bx >> 6) & 1;
    const int bg_intensity = (bx & 0x80) ? 0x40 : 0x00;
    const int bg_opaque    = (bx & 0x0F) != 0;

    const int spr_color     = (sx & 0x3F);
    const int spr_prio      = (sx >> 6) & 1;
    const int spr_intensity = (sx & 0x40) | bg_intensity;
    const int spr_opaque    = (sx & 0x0F) != 0;

    /* Low-priority sprite under an opaque high-priority background is hidden */
    const int hidden = (!spr_prio && bg_prio && bg_opaque);

    int out;

    if(!spr_opaque || hidden)
    {
        out = (bg_color | bg_intensity);
    }
    else
    {
        switch(spr_color)
        {
            case 0x3F:
                /* Background color with both intensity bits clear */
                out = bg_color;
                break;

            case 0x3E:
                /* Background color, one intensity step above its own */
                out = bg_color | ((bx & 0x80) ? 0x80 : 0x40);
                break;

            case 0x0E:
            case 0x1E:
            case 0x2E:
                /* These sprite colors ignore the background intensity */
                out = (spr_color | 0x40);
                break;

            default:
                out = (spr_color | spr_intensity);
                break;
        }
    }

    /* Strip palette bits from transparent pixels */
    if((out & 0x0F) == 0x00)
    {
        out &= 0xC0;
    }

    return out;
}

/*--------------------------------------------------------------------------*/
/* Remap functions                                                          */
/*--------------------------------------------------------------------------*/

/* Translate `length` 8-bit source values into 16-bit values by looking each */
/* one up in `table`, writing the results consecutively to `dst`.            */
void remap_16(uint8 *src, uint16 *dst, uint16 *table, int length)
{
    int i;

    for(i = 0; i < length; i += 1)
    {
        dst[i] = table[src[i]];
    }
}

/*--------------------------------------------------------------------------*/
/* Merge functions                                                          */
/*--------------------------------------------------------------------------*/

/* Combine two byte streams through a two-dimensional lookup table: for each */
/* of `width` positions the srcb byte selects the 256-entry row and the srca */
/* byte the entry within it; the looked-up byte is written to dst.           */
void merge(uint8 *srca, uint8 *srcb, uint8 *dst, uint8 *table, int width)
{
  for(; width != 0; width -= 1)
  {
    const int row = *srcb++;
    const int col = *srca++;

    *dst++ = table[(row << 8) | col];
  }
}

/*--------------------------------------------------------------------------*/
/* Color update functions                                                   */
/*--------------------------------------------------------------------------*/
/*
  Convert a 9-bit packed color (three 3-bit fields in bits 0-2, 3-5 and 6-8
  of `data`) to a 16-bit layout with the fields moved to bits 2-4, 7-9 and
  12-14, and bit 15 set when the low nibble of `index` is non-zero.

  NOTE(review): the converted value is not stored anywhere, so this function
  currently has no observable effect. The computation is kept to record the
  intended format; the value is discarded explicitly so the unused result
  is visibly deliberate and does not trigger set-but-unused warnings.
*/
void xcolor_update_16(int index, uint16 data)
{
  uint32 dst = data;

  dst = ((dst&0x007)<<2)|((dst&0x038)<<4)|((dst&0x1C0)<<6);

  /* Entry 0 of each 16-entry group is left with bit 15 clear */
  if (index & 0x0F)
    dst |= 0x8000;

  /* No destination is wired up for the converted color */
  (void)dst;
}


/*
  Refresh the three 64-entry banks of pixel_16[] (offsets 0x00, 0x40, 0x80)
  for palette entry `index` from the packed color `data`.

  When bit 3 of reg[12] is set, each bank takes its own row of
  pixel_16_lut[]; otherwise all three banks take row 1.
*/
void color_update_16(int index, uint16 data)
{
  const int per_bank = (reg[12] & 8) != 0;
  int bank;

  for(bank = 0; bank < 3; bank += 1)
  {
    const int row = per_bank ? bank : 1;

    pixel_16[(bank << 6) | index] = pixel_16_lut[row][data];
  }
}

/*--------------------------------------------------------------------------*/
/* Object render functions                                                  */
/*--------------------------------------------------------------------------*/

/*
  Collect the sprites that cross scanline `line` into object_info[] and
  set object_index_count to how many were found.

  The sprite list is walked through its link field starting at entry 0, for
  at most 64 or 80 entries (depending on bit 0 of reg[12]). Y position, size
  and link come from sat[]; X position and attributes are read from the
  table in VRAM at satb. When 16 or 20 sprites have been collected, bit 6
  of status is set (unless vint_pending is set) and parsing stops.
*/
void parse_satb(int line)
{
  const int limit = 16 + ((reg[12] & 1) << 2);
  const int total = limit << 2;

  int visited;
  int link = 0;

  object_index_count = 0;

  for(visited = 0; visited < total; visited += 1)
  {
    uint8 *entry = &sat[link];
    const uint16 top = ((*(uint16 *)&entry[0]) >> im2_flag) & 0x1FF;
    const int size = entry[3];
    /* Height is 8, 16, 24 or 32 lines, selected by the low two size bits */
    const int height = ((size & 3) + 1) << 3;

    if((line >= top) && (line < (top + height)))
    {
      uint8 *live = &vram[satb + link];

      /* Taking xpos from sat[] instead stops sprite x scrolling in */
      /* bloodlin.bin, although that seems to go against the test prog */
      object_info[object_index_count].ypos = *(uint16 *)&entry[0];
      object_info[object_index_count].xpos = *(uint16 *)&live[6];
      object_info[object_index_count].attr = *(uint16 *)&live[4];
      object_info[object_index_count].size = size;

      object_index_count += 1;

      if(object_index_count == limit)
      {
        /* Per-line sprite limit reached */
        if(vint_pending == 0)
          status |= 0x40;
        return;
      }
    }

    /* Follow the link field; a link of zero ends the list */
    link = entry[2] & 0x7F;
    if(link == 0)
      break;

    /* Entries are 8 bytes apart */
    link <<= 3;
  }
}

/*
  Draw the sprites listed in object_info[] (see parse_satb) for scanline
  `line` into the line buffer.

  line  - scanline being rendered.
  buf   - line buffer; sprite X position 0x80 maps to byte 0x20.
  table - not referenced directly in this function.
          NOTE(review): presumably the lookup table used by
          DRAW_SPRITE_TILE - confirm against the macro.

  NOTE(review): DRAW_SPRITE_TILE is defined outside this section and
  presumably uses locals such as lb, src and palette by name, so the
  declarations below must keep their names even where they look unused.
*/
void render_obj(int line, uint8 *buf, uint8 *table)
{
  uint16 ypos;
  uint16 attr;

  uint16 xpos;  
  uint8 size;
  uint8 *src;

  /* Sprite width/height in pixels for a 2-bit size field */
  uint8 sizetab[] = {8, 16, 24, 32};

  int count;
  /* Pixel budget per line: 256 or 320 depending on bit 0 of reg[12] */
  int pixellimit = 256 + ((reg[12] & 1) << 6);

  int pixelcount = 0;
  int width;

  int height;
  int v_line;

  int column;
  /* Set once a sprite with a non-zero X position has been seen */
  uint8 sol_flag = 0;

  /* Visible X range in sprite coordinates */
  int left = 0x80;
  int right = 0x80 + pixellimit;

  uint8 *s, *lb;
  uint16 name, index;
  uint8 palette;

  int attr_mask, nt_row;

  if(object_index_count == 0) return;

  for(count = 0; count < object_index_count; count += 1)
  {
    size = object_info[count].size & 0x0f;
    xpos = object_info[count].xpos & 0x1ff;

    /* Bits 2-3 of size select the width */
    width = sizetab[(size >> 2) & 3];

    /* A sprite at X = 0 that follows a sprite with non-zero X ends sprite drawing for this line */
    if(xpos != 0)
      sol_flag = 1;
    else if (sol_flag)
      return;

    /* Stop once the accumulated sprite width exceeds the per-line budget */
    if(pixelcount > pixellimit)
      return;
    pixelcount += width;

    /* Skip sprites that lie entirely outside the visible range */
    if ((xpos < right) && ((xpos + width) >= left))
    {
      ypos = object_info[count].ypos & 0x1ff;

      attr = object_info[count].attr;
      /* Attribute bits 11-12; shifted left by 6 below they select one of the
         0x20000-spaced copies in bg_pattern_cache */
      attr_mask = (attr & 0x1800);

      /* Bits 0-1 of size select the height */
      height = sizetab[size & 3];
      /* Attribute bits 13-15 moved to bits 4-6 */
      palette = (attr >> 9) & 0x70;

      /* Split the row inside the sprite into a pattern row (0-3) and a
         byte offset within the 8x8 pattern (8 bytes per row) */
      v_line = (line - ypos);
      nt_row = (v_line >> 3) & 3;
      v_line = (v_line & 7) << 3;

      name = (attr & 0x07FF);
      /* Pattern offset table entry for this flip setting, size and row;
         s[column] is added to the base pattern number */
      s = &name_lut[((attr >> 3) & 0x300) | (size << 4) | (nt_row << 2)];

      /* Left edge of the sprite in the line buffer */
      lb = (uint8 *)&buf[0x20 + (xpos - 0x80)];

      /* From here on width counts 8-pixel columns */
      width >>= 3;
      for(column = 0; column < width; column += 1, lb+=8)
      {
        index = attr_mask | ((name + s[column]) & 0x07FF);
        /* 64 bytes per cached pattern */
        src = &bg_pattern_cache[(index << 6) | (v_line)];
        DRAW_SPRITE_TILE;
      }
    }
  }
}

/*
  Interlace-mode-2 variant of render_obj(): draw the sprites listed in
  object_info[] for scanline `line` into the line buffer.

  Differences from render_obj() visible here: the stored Y position is
  halved before masking, the pattern row is doubled and combined with bit 4
  of status, pattern numbers are masked to 10 bits, and the cache offset
  uses 128 bytes per pattern.

  line  - scanline being rendered.
  buf   - line buffer; sprite X position 0x80 maps to byte 0x20.
  table - not referenced directly in this function.
          NOTE(review): presumably the lookup table used by
          DRAW_SPRITE_TILE - confirm against the macro.

  NOTE(review): DRAW_SPRITE_TILE is defined outside this section and
  presumably uses locals such as lb, src and palette by name, so the
  declarations below must keep their names even where they look unused.
*/
void render_obj_im2(int line, uint8 *buf, uint8 *table)
{
    uint16 ypos;
    uint16 attr;
    uint16 xpos;
    /* Sprite width/height in pixels for a 2-bit size field */
    uint8 sizetab[] = {8, 16, 24, 32};
    uint8 size;
    uint8 *src;

    int count;
    /* Pixel budget per line: 256 or 320 depending on bit 0 of reg[12] */
    int pixellimit = 256 + ((reg[12] & 1) << 6);
    int pixelcount = 0;
    int width;
    int height;
    int v_line;
    int column;
    /* Set once a sprite with a non-zero X position has been seen */
    int sol_flag = 0;
    /* Visible X range in sprite coordinates */
    int left = 0x80;
    int right = 0x80 + pixellimit;

    uint8 *s, *lb;
    uint16 name, index;
    uint8 palette;
    uint32 offs;

    int attr_mask, nt_row;

    if(object_index_count == 0) return;

    for(count = 0; count < object_index_count; count += 1)
    {
        size = object_info[count].size & 0x0f;
        xpos = object_info[count].xpos;
        xpos &= 0x1ff;

        /* Bits 2-3 of size select the width */
        width = sizetab[(size >> 2) & 3];

        /* A sprite at X = 0 that follows a sprite with non-zero X ends sprite drawing for this line */
        if(xpos != 0) sol_flag = 1;
        else
        if(xpos == 0 && sol_flag) return;

        /* Stop once the accumulated sprite width exceeds the per-line budget */
        if(pixelcount > pixellimit) return;
        pixelcount += width;

        /* Skip sprites that lie entirely outside the visible range */
        if(((xpos + width) >= left) && (xpos < right))
        {
            /* Stored Y position is halved before use in this mode */
            ypos = object_info[count].ypos;
            ypos = (ypos >> 1) & 0x1ff;

            attr = object_info[count].attr;
            /* Attribute bits 11-12, merged into the cache offset below */
            attr_mask = (attr & 0x1800);

            /* Bits 0-1 of size select the height */
            height = sizetab[size & 3];
            /* Attribute bits 13-15 moved to bits 4-6 */
            palette = (attr >> 9) & 0x70;

            v_line = (line - ypos);
            nt_row = (v_line >> 3) & 3;
            /* Row within the pattern is doubled and combined with bit 4 of status */
            /* NOTE(review): bit 4 of status is presumably the odd/even field flag - confirm */
            v_line = (((v_line & 7) << 1) | ((status >> 4) & 1)) << 3;            

            /* Pattern numbers are 10 bits wide here */
            name = (attr & 0x03FF);
            /* Pattern offset table entry for this flip setting, size and row;
               s[column] is added to the base pattern number */
            s = &name_lut[((attr >> 3) & 0x300) | (size << 4) | (nt_row << 2)];

            /* Left edge of the sprite in the line buffer */
            lb = (uint8 *)&buf[0x20 + (xpos - 0x80)];

            /* From here on width counts 8-pixel columns */
            width >>= 3;
            for(column = 0; column < width; column += 1, lb+=8)
            {
                index = (name + s[column]) & 0x3ff;
                /* 128 bytes per pattern, plus the copy selected by attribute bits 11-12 */
                offs = index << 7 | attr_mask << 6 | v_line;
                /* With attribute bit 12 set, switch to the other 64-byte half of the pattern */
                if(attr & 0x1000) offs ^= 0x40;
                src = &bg_pattern_cache[offs];
                DRAW_SPRITE_TILE;
            }
        }
    }
}

/* SWRENDER_END */
