Optimize hires line draw

2024-12-15 16:34:53 +01:00 · 2024-12-15 16:34:53 +01:00 · 4fce263228
parent 50c7e10814
commit 4fce263228
5 changed files with 127 additions and 57 deletions
--- a/include/fixmath.c
+++ b/include/fixmath.c
@ -123,8 +123,9 @@ int lmul4f12s(int x, int y)
 		lda #0
 		sta	accu + 1
-L2:		
+
 		bcc	W4
 L2:		
 		tay
 		clc
 		lda	accu + 1
@ -165,7 +166,7 @@ W1:
 		bcc W2
 		tay
-		sec
+//		sec 		; we know it is set here
 		lda accu + 1
 		sbc y
 		sta accu + 1
--- a/include/gfx/bitmap.c
+++ b/include/gfx/bitmap.c
@ -525,7 +525,7 @@ void bm_polygon_nc_fill(const Bitmap * bm, const ClipRect * clip, int * px, int
 static inline void buildline(char ly, char lx, int dx, int dy, int stride, bool left, bool up, char pattern, LineOp op)
 {
 	char	ip = 0;
-	bool	delta16 = ((dx | dy) & 0xff80) != 0;
+	bool	delta16 =((dx | dy) & 0xff80) != 0;
 	// ylow
 	ip += asm_im(BLIT_CODE + ip, ASM_LDY, ly);
@ -579,82 +579,105 @@ static inline void buildline(char ly, char lx, int dx, int dy, int stride, bool
 		break;
 	}
 	if (dx && dy)
 	{
 		// m >= 0
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + delta16);
 		ip += asm_rl(BLIT_CODE + ip, ASM_BMI, delta16 ? 5 + 15 + 13 + 2 : 5 + 15 + 7 + 2);
 	}
 	if (dy)
 	{
-		ip += asm_np(BLIT_CODE + ip, up ? ASM_DEY : ASM_INY);
+		bool	delta8 = false;
 		ip += asm_im(BLIT_CODE + ip, ASM_CPY, up ? 0xff : 0x08);
 		ip += asm_rl(BLIT_CODE + ip, ASM_BNE, 15);
-		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+		if (dx)
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
 		ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride & 0xff);
 		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
 		ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride >> 8);
 		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
 		ip += asm_im(BLIT_CODE + ip, ASM_LDY, up ? 0x07 : 0x00);
 	}
 	if (dx && dy)
 	{
 		ip += asm_np(BLIT_CODE + ip, ASM_SEC);
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
 		ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx & 0xff);
 		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
 		if (delta16)
 		{
-			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
+			// m >= 0
-			ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx >> 8);
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + delta16);
-			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
+			char n = delta16 ? 18 + 13 + 2 : 18 + 7 + 2;
 			if (!up) n++;
 			ip += asm_rl(BLIT_CODE + ip, ASM_BMI, n);
 			delta8 = !delta16;
 		}
 		if (up)
 		{
 			ip += asm_np(BLIT_CODE + ip, ASM_DEY);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BPL, delta8 ? 17 : 15);
 			ip += asm_np(BLIT_CODE + ip, ASM_CLC);
 			ip += asm_im(BLIT_CODE + ip, ASM_LDY, 0x07);
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride & 0xff);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride >> 8);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
 		}
 		else
 		{
 			ip += asm_np(BLIT_CODE + ip, ASM_INY);
 			ip += asm_im(BLIT_CODE + ip, ASM_CPY, 0x08);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BNE, delta8 ? 16 : 14);
 			ip += asm_im(BLIT_CODE + ip, ASM_LDY, 0x00);
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, (stride - 1) & 0xff);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, (stride - 1) >> 8);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
 		}
 		if (dx)
 		{
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
 			ip += asm_np(BLIT_CODE + ip, ASM_SEC);
 			ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx & 0xff);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
 			if (delta16)
 			{
 				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
 				ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx >> 8);
 				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
 				ip += asm_rl(BLIT_CODE + ip, ASM_BPL, 13 + 4 + 12);
 				ip += asm_np(BLIT_CODE + ip, ASM_CLC);
 				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
 				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
 				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
 				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
 				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy >> 8);
 				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
 			}
 			else
 			{
 				// We know regdp to be in the accu at this point
 				ip += asm_rl(BLIT_CODE + ip, ASM_BPL, 5 + 4 + 12);
 				ip += asm_np(BLIT_CODE + ip, ASM_CLC);
 				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
 				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
 			}
 		}
 		// m < 0
 		ip += asm_rl(BLIT_CODE + ip, ASM_BPL, delta16 ? 4 + 13 + 13 : 4 + 13 + 7);
 	}
 	if (dx)
 	{
 		ip += asm_zp(BLIT_CODE + ip, left ? ASM_ASL : ASM_LSR, REG_D0);
-		ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 13);
+		ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 12);
 		ip += asm_zp(BLIT_CODE + ip, left ? ASM_ROL : ASM_ROR, REG_D0);
-		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
 		ip += asm_im(BLIT_CODE + ip, ASM_ADC, left ? 0xf8 : 0x08);
 		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 		if (left)
 		{
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, 0xf8);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BCS, 2);
 			ip += asm_zp(BLIT_CODE + ip, ASM_DEC, REG_SP + 1);
 		}
 		else
 		{
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, 0x08);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 2);
 			ip += asm_zp(BLIT_CODE + ip, ASM_INC, REG_SP + 1);
 		}
 	}
-	if (dx && dy)
+		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 	{
 		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
 		ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
 		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
 		if (delta16)
 		{
 			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
 			ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy >> 8);
 			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
 		}
 	}
 	// l --
--- a/oscar64/InterCode.cpp
+++ b/oscar64/InterCode.cpp
@ -13613,7 +13613,7 @@ bool InterCodeBasicBlock::MoveTrainCrossBlock(void)
 						FastNumberSet	nset(mEntryRequiredTemps.Size());
 						InterInstruction* ins(mInstructions[i]);
-						if (ins->mCode == IC_STORE)
+						if (ins->mCode == IC_STORE && ins->mSrc[0].mFinal)
 						{
 							for (int k = 0; k < ins->mNumOperands; k++)
 							{
@ -22996,7 +22996,7 @@ void InterCodeProcedure::Close(void)
 {
 	GrowingTypeArray	tstack(IT_NONE);
-	CheckFunc = !strcmp(mIdent->mString, "test");
+	CheckFunc = !strcmp(mIdent->mString, "main");
 	CheckCase = false;
 	mEntryBlock = mBlocks[0];
--- a/oscar64/NativeCodeGenerator.cpp
+++ b/oscar64/NativeCodeGenerator.cpp
@ -37694,7 +37694,7 @@ bool NativeCodeBasicBlock::OptimizeSimpleLoopInvariant(NativeCodeProcedure* proc
 	while (ai < mIns.Size() && !mIns[ai].ChangesAccu())
 		ai++;
-	if (ai < mIns.Size() && !(mIns[ai].mLive & LIVE_CPU_REG_Z))
+	if (ai < mIns.Size() && !(mIns[ai].mLive & LIVE_CPU_REG_Z) && !mEntryRequiredRegs[CPU_REG_A])
 	{
 		if (mIns[ai].mType == ASMIT_LDA && mIns[ai].mMode == ASMIM_IMMEDIATE)
 		{
@ -51811,7 +51811,7 @@ void NativeCodeProcedure::Compile(InterCodeProcedure* proc)
 	mInterProc = proc;
 	mInterProc->mLinkerObject->mNativeProc = this;
-	CheckFunc = !strcmp(mInterProc->mIdent->mString, "bmmcu_line");
+	CheckFunc = !strcmp(mInterProc->mIdent->mString, "main");
 	int	nblocks = proc->mBlocks.Size();
 	tblocks = new NativeCodeBasicBlock * [nblocks];
--- a/samples/hires/cube3d.c
+++ b/samples/hires/cube3d.c
@ -48,7 +48,7 @@ struct Point
 };
-Point	tcorners[8], pcorners[8];
+__striped	Point	tcorners[8], pcorners[8];
 void drawCube(void)
 {
@ -77,6 +77,45 @@ void hideCube(void)
 	}
 }
 // XOR-draw every cube edge at the current tcorners positions, then save
 // each corner into pcorners.
 void xorCube(void)
 {
 	for (char i = 0; i < 8; i++)
 	{
 		// A corner index is a 3-bit code; an edge joins i to the index that
 		// has one additional bit set, so each edge is drawn once, from its
 		// lower-numbered end.
 		for (char bit = 1; bit < 8; bit <<= 1)
 		{
 			if (i & bit)
 				continue;

 			char	j = i | bit;
 			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[j].x, tcorners[j].y, 0xff, LINOP_XOR);
 		}

 		pcorners[i] = tcorners[i];
 	}
 }
 // For every cube edge, XOR-draw it at the saved pcorners positions and then
 // at the current tcorners positions; afterwards copy tcorners into pcorners.
 // NOTE(review): the pcorners pass presumably removes the edge drawn by the
 // previous call, since a second XOR restores the pixels - confirm.
 void xor2Cube(void)
 {
 	for (char i = 0; i < 8; i++)
 	{
 		// Walk the three axis bits; an edge exists from i only towards the
 		// corner whose index has that bit additionally set.
 		for (char bit = 1; bit < 8; bit <<= 1)
 		{
 			if (i & bit)
 				continue;

 			char	j = i | bit;
 			bm_line(&Screen, &cr, pcorners[i].x, pcorners[i].y, pcorners[j].x, pcorners[j].y, 0xff, LINOP_XOR);
 			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[j].x, tcorners[j].y, 0xff, LINOP_XOR);
 		}
 	}

 	// Copy only after all edges are done: edges starting at a lower index
 	// still read the saved position of higher-numbered corners.
 	for (char c = 0; c < 8; c++)
 	{
 		pcorners[c] = tcorners[c];
 	}
 }
 #if 1
 F12Vector3	corners[8];
@ -112,8 +151,15 @@ int main(void)
 			tcorners[i].y = lmuldiv16s(vd.v[1], 140, vd.v[2] + 4 * FIX12_ONE) + 100;
 		}
 #if 1
 		if (k)
 			xor2Cube();
 		else
 			xorCube();
 #else
 		hideCube();
 		drawCube();
 #endif
 	}