diff --git a/include/fixmath.c b/include/fixmath.c
index 7624dbc..23d194e 100644
--- a/include/fixmath.c
+++ b/include/fixmath.c
@@ -123,8 +123,9 @@ int lmul4f12s(int x, int y)
 
 		lda #0
 		sta	accu + 1
-L2:		
+
 		bcc	W4
+L2:		
 		tay
 		clc
 		lda	accu + 1
@@ -165,7 +166,7 @@ W1:
 		bcc W2
 
 		tay
-		sec
+//		sec 		; not needed: carry is known to be set here (bcc W2 fell through)
 		lda accu + 1
 		sbc y
 		sta accu + 1
diff --git a/include/gfx/bitmap.c b/include/gfx/bitmap.c
index f25f788..ad796e0 100644
--- a/include/gfx/bitmap.c
+++ b/include/gfx/bitmap.c
@@ -525,7 +525,7 @@ void bm_polygon_nc_fill(const Bitmap * bm, const ClipRect * clip, int * px, int
 static inline void buildline(char ly, char lx, int dx, int dy, int stride, bool left, bool up, char pattern, LineOp op)
 {
 	char	ip = 0;
-	bool	delta16 = ((dx | dy) & 0xff80) != 0;
+	bool	delta16 =((dx | dy) & 0xff80) != 0;
 
 	// ylow
 	ip += asm_im(BLIT_CODE + ip, ASM_LDY, ly);
@@ -579,82 +579,105 @@ static inline void buildline(char ly, char lx, int dx, int dy, int stride, bool
 		break;
 	}
 
-	if (dx && dy)
-	{
-		// m >= 0
-		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + delta16);
-		ip += asm_rl(BLIT_CODE + ip, ASM_BMI, delta16 ? 5 + 15 + 13 + 2 : 5 + 15 + 7 + 2);
-	}
-
 	if (dy)
 	{
-		ip += asm_np(BLIT_CODE + ip, up ? ASM_DEY : ASM_INY);
-		ip += asm_im(BLIT_CODE + ip, ASM_CPY, up ? 0xff : 0x08);
-		ip += asm_rl(BLIT_CODE + ip, ASM_BNE, 15);
+		bool	delta8 = false;
 
-		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
-		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
-		ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride & 0xff);
-		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
-		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
-		ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride >> 8);
-		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
-		ip += asm_im(BLIT_CODE + ip, ASM_LDY, up ? 0x07 : 0x00);
-	}
-
-	if (dx && dy)
-	{
-		ip += asm_np(BLIT_CODE + ip, ASM_SEC);
-		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
-		ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx & 0xff);
-		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
-
-		if (delta16)
+		if (dx)
 		{
-			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
-			ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx >> 8);
-			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
+			// m >= 0
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + delta16);
+			char n = delta16 ? 18 + 13 + 2 : 18 + 7 + 2;
+			if (!up) n++;
+			ip += asm_rl(BLIT_CODE + ip, ASM_BMI, n);
+			delta8 = !delta16;
+		}
+
+		if (up)
+		{
+			ip += asm_np(BLIT_CODE + ip, ASM_DEY);
+			ip += asm_rl(BLIT_CODE + ip, ASM_BPL, delta8 ? 17 : 15);
+			ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+			ip += asm_im(BLIT_CODE + ip, ASM_LDY, 0x07);
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride & 0xff);
+			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, stride >> 8);
+			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
+		}
+		else
+		{
+			ip += asm_np(BLIT_CODE + ip, ASM_INY);
+			ip += asm_im(BLIT_CODE + ip, ASM_CPY, 0x08);
+			ip += asm_rl(BLIT_CODE + ip, ASM_BNE, delta8 ? 16 : 14);
+			ip += asm_im(BLIT_CODE + ip, ASM_LDY, 0x00);
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, (stride - 1) & 0xff);
+			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP + 1);
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, (stride - 1) >> 8);
+			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP + 1);
+		}
+
+		if (dx)
+		{
+			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
+			ip += asm_np(BLIT_CODE + ip, ASM_SEC);
+			ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx & 0xff);
+			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
+
+			if (delta16)
+			{
+				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
+				ip += asm_im(BLIT_CODE + ip, ASM_SBC, dx >> 8);
+				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
+				ip += asm_rl(BLIT_CODE + ip, ASM_BPL, 13 + 4 + 12);
+
+				ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
+				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
+				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
+				ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
+				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy >> 8);
+				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
+			}
+			else
+			{
+				// We know regdp to be in the accu at this point
+				ip += asm_rl(BLIT_CODE + ip, ASM_BPL, 5 + 4 + 12);
+				ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+				ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
+				ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
+			}
 		}
 
 		// m < 0
-		ip += asm_rl(BLIT_CODE + ip, ASM_BPL, delta16 ? 4 + 13 + 13 : 4 + 13 + 7);
 	}
 
 	if (dx)
 	{
 		ip += asm_zp(BLIT_CODE + ip, left ? ASM_ASL : ASM_LSR, REG_D0);
-		ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 13);
+		ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 12);
 
 		ip += asm_zp(BLIT_CODE + ip, left ? ASM_ROL : ASM_ROR, REG_D0);
-		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
+
 		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_SP);
-		ip += asm_im(BLIT_CODE + ip, ASM_ADC, left ? 0xf8 : 0x08);
-		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 
 		if (left)
 		{
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, 0xf8);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BCS, 2);
 			ip += asm_zp(BLIT_CODE + ip, ASM_DEC, REG_SP + 1);
 		}
 		else
 		{
+			ip += asm_im(BLIT_CODE + ip, ASM_ADC, 0x08);
 			ip += asm_rl(BLIT_CODE + ip, ASM_BCC, 2);
 			ip += asm_zp(BLIT_CODE + ip, ASM_INC, REG_SP + 1);
 		}
-	}
 
-	if (dx && dy)
-	{
-		ip += asm_np(BLIT_CODE + ip, ASM_CLC);
-		ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP);
-		ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy & 0xff);
-		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP);
-		if (delta16)
-		{
-			ip += asm_zp(BLIT_CODE + ip, ASM_LDA, REG_DP + 1);
-			ip += asm_im(BLIT_CODE + ip, ASM_ADC, dy >> 8);
-			ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_DP + 1);
-		}
+		ip += asm_zp(BLIT_CODE + ip, ASM_STA, REG_SP);
 	}
 
 	// l --
diff --git a/oscar64/InterCode.cpp b/oscar64/InterCode.cpp
index ee20015..019cd04 100644
--- a/oscar64/InterCode.cpp
+++ b/oscar64/InterCode.cpp
@@ -13613,7 +13613,7 @@ bool InterCodeBasicBlock::MoveTrainCrossBlock(void)
 						FastNumberSet	nset(mEntryRequiredTemps.Size());
 
 						InterInstruction* ins(mInstructions[i]);
-						if (ins->mCode == IC_STORE)
+						if (ins->mCode == IC_STORE && ins->mSrc[0].mFinal)
 						{
 							for (int k = 0; k < ins->mNumOperands; k++)
 							{
@@ -22996,7 +22996,7 @@ void InterCodeProcedure::Close(void)
 {
 	GrowingTypeArray	tstack(IT_NONE);
 
-	CheckFunc = !strcmp(mIdent->mString, "test");
+	CheckFunc = !strcmp(mIdent->mString, "main");
 	CheckCase = false;
 
 	mEntryBlock = mBlocks[0];
diff --git a/oscar64/NativeCodeGenerator.cpp b/oscar64/NativeCodeGenerator.cpp
index 99ab5d3..682cdff 100644
--- a/oscar64/NativeCodeGenerator.cpp
+++ b/oscar64/NativeCodeGenerator.cpp
@@ -37694,7 +37694,7 @@ bool NativeCodeBasicBlock::OptimizeSimpleLoopInvariant(NativeCodeProcedure* proc
 	while (ai < mIns.Size() && !mIns[ai].ChangesAccu())
 		ai++;
 
-	if (ai < mIns.Size() && !(mIns[ai].mLive & LIVE_CPU_REG_Z))
+	if (ai < mIns.Size() && !(mIns[ai].mLive & LIVE_CPU_REG_Z) && !mEntryRequiredRegs[CPU_REG_A])
 	{
 		if (mIns[ai].mType == ASMIT_LDA && mIns[ai].mMode == ASMIM_IMMEDIATE)
 		{
@@ -51811,7 +51811,7 @@ void NativeCodeProcedure::Compile(InterCodeProcedure* proc)
 	mInterProc = proc;
 	mInterProc->mLinkerObject->mNativeProc = this;
 
-	CheckFunc = !strcmp(mInterProc->mIdent->mString, "bmmcu_line");
+	CheckFunc = !strcmp(mInterProc->mIdent->mString, "main");
 
 	int	nblocks = proc->mBlocks.Size();
 	tblocks = new NativeCodeBasicBlock * [nblocks];
diff --git a/samples/hires/cube3d.c b/samples/hires/cube3d.c
index 6ef119b..5c5f0f7 100644
--- a/samples/hires/cube3d.c
+++ b/samples/hires/cube3d.c
@@ -48,7 +48,7 @@ struct Point
 };
 
 
-Point	tcorners[8], pcorners[8];
+__striped	Point	tcorners[8], pcorners[8];
 
 void drawCube(void)
 {
@@ -77,6 +77,45 @@ void hideCube(void)
 	}
 }
 
+void xorCube(void)	// XOR-draws 12 lines between the tcorners points, then copies tcorners to pcorners
+{
+	for(char i=0; i<8; i++)	// i runs over all 8 corner indices
+	{
+		if (!(i & 1))	// bit 0 clear: line from corner i to corner i | 1
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 1].x, tcorners[i | 1].y, 0xff, LINOP_XOR);
+		if (!(i & 2))	// bit 1 clear: line from corner i to corner i | 2
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 2].x, tcorners[i | 2].y, 0xff, LINOP_XOR);
+		if (!(i & 4))	// bit 2 clear: line from corner i to corner i | 4
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 4].x, tcorners[i | 4].y, 0xff, LINOP_XOR);
+		pcorners[i] = tcorners[i];	// keep a copy of the coordinates that were just used
+	}
+}
+
+void xor2Cube(void)	// per corner pair: XOR-draws the pcorners line, then the tcorners line; finally copies tcorners to pcorners
+{
+	for(char i=0; i<8; i++)	// i runs over all 8 corner indices
+	{
+		if (!(i & 1))	// bit 0 clear: pair (i, i | 1)
+		{
+			bm_line(&Screen, &cr, pcorners[i].x, pcorners[i].y, pcorners[i | 1].x, pcorners[i | 1].y, 0xff, LINOP_XOR);	// NOTE(review): presumably undraws the line from the previous call - confirm
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 1].x, tcorners[i | 1].y, 0xff, LINOP_XOR);
+		}
+		if (!(i & 2))	// bit 1 clear: pair (i, i | 2)
+		{
+			bm_line(&Screen, &cr, pcorners[i].x, pcorners[i].y, pcorners[i | 2].x, pcorners[i | 2].y, 0xff, LINOP_XOR);
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 2].x, tcorners[i | 2].y, 0xff, LINOP_XOR);
+		}
+		if (!(i & 4))	// bit 2 clear: pair (i, i | 4)
+		{
+			bm_line(&Screen, &cr, pcorners[i].x, pcorners[i].y, pcorners[i | 4].x, pcorners[i | 4].y, 0xff, LINOP_XOR);
+			bm_line(&Screen, &cr, tcorners[i].x, tcorners[i].y, tcorners[i | 4].x, tcorners[i | 4].y, 0xff, LINOP_XOR);
+		}
+	}
+
+	for(char i=0; i<8; i++)	// runs only after every line above has been drawn
+		pcorners[i] = tcorners[i];	// tcorners becomes the saved set read by the next call
+}
+
 #if 1
 
 F12Vector3	corners[8];
@@ -112,8 +151,15 @@ int main(void)
 			tcorners[i].y = lmuldiv16s(vd.v[1], 140, vd.v[2] + 4 * FIX12_ONE) + 100;
 		}
 
+#if 1
+		if (k)
+			xor2Cube();
+		else
+			xorCube();
+#else
 		hideCube();
 		drawCube();
+#endif
 
 	}