From 91cb2fda154980059ceb082d12c08ced2057784f Mon Sep 17 00:00:00 2001 From: drmortalwombat <90205530+drmortalwombat@users.noreply.github.com> Date: Sat, 19 Mar 2022 14:35:49 +0100 Subject: [PATCH] Documenting compiler optimizations --- README.md | 286 +++++++++++++++++++++++++++++++- oscar64/NativeCodeGenerator.cpp | 127 +++++++++++++- oscar64/NativeCodeGenerator.h | 1 + 3 files changed, 400 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 7772484..9570e18 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,35 @@ The compiler is command line driven, and creates an executable .prg file. A list of source files can be provided. +### Files generated + +The main file generated by the compiler is a .prg, .crt or .bin with the code and constant data. Additional files are created to support debugging and analysis: + +#### Map file ".map" + +Shows the addresses of all regions, sections and objects. This is a good place to look if your generated code turns out to be too large. + +#### Assembler source ".asm" + +A listing of the generated bytecode and native assembler instructions. A good place to cross-reference when stuck in the machine code monitor. + +#### Intermediate code ".int" + +A listing of the generated intermediate code. + +#### Vice debugger ".lbl" + +Creates VICE monitor commands to define all static labels. + + al 0801 .startup + al 164d .spentry + al 174c .BSSStart + al 1766 .BSSEnd + al 9fd7 .StackEnd + +One can load the label file in the monitor using the load_labels (ll) command or provide it on the command line for VICE with the "-moncommands" command line argument. + + ### Building the samples The windows installer puts the samples into the users documents folder, using the directory "%userprofile%\documents\oscar64\samples". A batch file *make.bat* is also placed into this directory which invokes the compiler and builds all samples. It invokes a second batch file in "%userprofile%\documents\oscar64\bin\oscar64.bat" that calls the compiler. @@ -134,9 +163,6 @@ The character map for string and char constants can be changed with a pragma to #pragma charmap(char, code [,count]) - -## Language extensions for optimization - ### Additional Optimizer information using __assume() The compiler can be provided with additional information using the built in function __assume(cond). This can be useful to mark unreachable code using __assume(false) for e.g. the default of a switch statement. Another good option is to limit the value range of arguments to allow the compiler using byte operations without the need for integer promotion. @@ -159,6 +185,35 @@ Or alternatively with a __native storage class specifier (*Bitmap)[y >> 3][x >> 3][y & 7] |= 0x80 >> (x & 7); } +### Pre-Processor control + +The pre-processor has additional commands to control the scanner and allow for dynamic code generation, including loops. + +Assign a computed value to a pre-processor macro + + #assign + +Loop source code + + #repeat + ... + #until + +This sample fills a single screen column with a given color by generating 25 assignments to absolute addresses. + + void color_column(char cx, char color) + { + #assign ry 0 + #repeat + Color1[40 * ry + cx] = color; + #assign ry ry + 1 + #until ry == 25 + #undef ry + } + +The sample initially assigns the value 0 to the pre-processor macro ry and increments it each time the loop body is replicated. The loop generates 25 copies of the body, each with a different value for ry.
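+For illustration, the expansion is roughly equivalent to writing the 25 assignments out by hand. This is a sketch only; Color1 is assumed to point at the color RAM as in the sample above, and the 40 * ry offsets are shown already folded.
+
+    // assumed declaration, as used by the sample above
+    char * const Color1 = (char *)0xd800;
+
+    // hand written equivalent of the replicated loop body
+    void color_column_expanded(char cx, char color)
+    {
+        Color1[  0 + cx] = color;
+        Color1[ 40 + cx] = color;
+        Color1[ 80 + cx] = color;
+        // ... 21 further assignments with offsets 120 to 920 ...
+        Color1[960 + cx] = color;
+    }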
+ + ### Linker control The linker includes only objects that are referenced, starting by the startup code into main() and so on. @@ -583,7 +638,7 @@ The shots usd dynamic created characters to overlay on the background. ## Implementation Details - +### The bytecode interpreter The byte code interpreter is compiled by the compiler itself and placed in the source file "crt.c". Functions implementing byte codes are marked with a pragma: @@ -604,17 +659,20 @@ The functions are written in 6502 assembly with the __asm keyword jmp startup.exec } -The current byte code program counter is (ip),y. The interpreter loop guarantees that y is always <= 128 and can thus be used to index the additional byte code arguments without the need to check the 16 bit pointer. The interpreter loop itself is quite compact and takes 21 cycles (including the final jump of the byte code function itself). Moving it to zero page would reduce this by another two cycles but is most likely not worth the waste of temporary space. +The current byte code program counter is (ip),y. The compiler and interpreter loop guarantee that y is always <= 240 and can thus be used to index the additional byte code arguments without the need to check the 16 bit pointer. The interpreter loop itself is quite compact and takes 19 cycles (including the final jump of the byte code function itself). Moving it to zero page would reduce this by another two cycles but is most likely not worth the waste of temporary space. exec: lda (ip), y sta execjmp + 1 iny - bmi incip execjmp: jmp (0x0900) -The intermediate code generator assumes a large number of registers so the zero page is used for this purpose. The allocation is not yet final: +Only JMP/BRANCH and NOP bytecodes check whether the y register exceeds its allowed range of 0..240. The compiler ensures that there are no linear code sequences longer than 240 bytes by inserting NOP bytecodes at appropriate distances. + +### Zero page usage + +The intermediate code generator assumes a large number of registers so the zero page is used for this purpose. The allocation is not yet final (and can be changed using pragmas): + * **0x02-0x02** spilling of y register * **0x03-0x0c** workspace for mul/div and floating point routines @@ -627,6 +685,220 @@ The intermediate code generator assumes a large number of registers so the zero * **0x43-0x52** caller saved registers * **0x53-0x8f** callee saved registers +### Compiler Stages + +The compiler uses most of the traditional stages: + +#### Scanner/Pre-Processor + +The scanner and pre-processor phases are interleaved (causing subtle incompatibilities with C) but allow additional features such as loop expansion. + +#### Parser + +The parser uses an extended recursive descent algorithm and relies on already parsed information to control the parsing. It creates a graph built of "Declaration" objects. It performs basic constant folding and limited type checking. + +#### Global Analyzer + +The global analyzer looks at the functions of all compilation modules and decides the calling convention and inlining for all functions. + +#### Intermediate Code Generator + +The intermediate code generator takes the declaration graph and converts it into intermediate code and linker data objects. This step also performs the final type checks and additional constant folding. Inlining is also performed in this step.
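+As a small, hypothetical sketch of what this stage can do: a call to a trivial function may be inlined here and the remaining arithmetic folded away in the intermediate code.
+
+    // hypothetical example; the global analyzer may decide to inline sqr(),
+    // and the remaining multiplication is then folded to a constant
+    static int sqr(int x)
+    {
+        return x * x;
+    }
+
+    int test(void)
+    {
+        return sqr(6);    // may end up as a plain "return 36" in the intermediate code
+    }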
+ +#### Intermediate Code Optimizer + +The intermediate code optimizer works on the intermediate code to perform machine-agnostic optimizations, such as value forwarding, strength reduction or loop unrolling. It also tries to build evaluation trains to reduce the amount of temporaries required for execution. + +#### Native/Bytecode Generator + +Translates the intermediate code into symbolic bytecode or 6502 assembler instructions. The code generator looks at several instructions at the same time to pick good native instructions. + +#### Native/Bytecode Optimizer + +The symbolic assembler code is then further optimized using a specialized optimizer. + +#### Native/Bytecode Assembler + +This stage translates the symbolic assembler / bytecode into binary and makes it available for the linker. It also calculates the optimal placement of the basic blocks to take advantage of the single byte branch offsets. + +#### Linker + +The linker determines all objects that are referenced and discards objects that are not. It then places the objects into their sections and spreads the sections over the regions. Then all references between the objects are resolved. Finally the prg or crt file is written. + +#### Listing generator + +The final step is generating various listing files to support debugging. + +### Optimizations performed + +Besides simple constant folding, optimizations are performed at three stages. + +* Inlining during intermediate code generation +* Transforming the intermediate code +* Transforming the native/bytecode + +#### Intermediate code optimizations + +##### Value numbering and forwarding + +This step determines duplicate evaluation of the same expression and forwards the result of the first to the second, thus eliminating the duplicate. Duplicate expressions are frequently the result of array indexing. This also covers common subexpression evaluation. + +##### Strength reduction + +Multiplications and divisions are replaced with shifts and/or adds if possible. E.g. indexing in loops using the index variable may end up as a pointer that is incremented each iteration. + +##### Loop Invariants + +Expressions that do not change during the execution of a loop are pushed out of the loop and into the block before the loop. + +##### Loop unrolling + +Simple loops with a fixed number of iterations may be unrolled into a contiguous code sequence. + +##### Dead Code Elimination + +Instructions that are not executed, or that do not contribute to the visible state of the program, are eliminated. E.g. assigning a value to a variable that is later not used is removed. + +##### Basic Block Merge/Split + +Basic blocks are merged and/or split to generate longer unbroken linear code sequences. This e.g. moves the condition of a while loop to the end of the loop and adds an additional condition at the start. + +##### Load/Store forwarding + +Performs Load-Load and Store-Load forwarding for memory regions that are known to be non-aliased. This also opens new opportunities for constant folding and common subexpression evaluation. This may also completely eliminate some stores. + +##### Integer Range Analysis + +The possible value ranges of all integer instructions are analyzed and used to predict results of conditional instructions. Conditional instructions provide additional bounds for value ranges. This becomes important during native code generation to simplify 16 bit code to 8 bit.
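+For instance, a guarding comparison alone can narrow a value range (a hypothetical sketch, assuming a global char array a):
+
+    char a[16];
+
+    void store_scaled(unsigned n)
+    {
+        // inside the branch n is known to be 0..15, so both the index
+        // and the shifted value fit into 8 bit
+        if (n < 16)
+            a[n] = (char)(n << 1);
+    }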
+ +The compiler can deduce from the operations performed and the loop boundaries that the index i can never be outside the interval 0..10: + + char sum(void) + { + char s = 0; + for(int i=0; i<10; i++) + s += a[i]; + return s; + } + +The 16 bit integer loop variable is replaced with an 8 bit register: + + sum: + 0927 LDY #$00 + 0929 LDA #$00 + 092b CLC + 092c ADC $0972,y ; (a + 0) + 092f INY + 0930 CPY #$04 + 0932 BCC $092b + 0934 STA ACCU + 0 + 0936 RTS + +##### Single Branch Expressions + +Expressions that are only needed in a single branch of control flow are moved into that branch. + + +The sum of the array elements is only needed when the condition is false: + + char test(bool b) + { + char x = a[0] + a[1]; + if (b) x = 1; + return x; + } + +In this case, the evaluation of the sum is moved into an artificial else case: + + test: + 090e LDA P0 + 0910 BNE $091c + 0912 LDA $0923 ; (a + 1) + 0915 CLC + 0916 ADC $0922 ; (a + 0) + 0919 JMP $091e ; (test + 16) + 091c LDA #$01 + 091e STA ACCU + 0 + 0920 RTS + +##### Common head/tail Sequence coalescing + +Instructions that are common to all paths of a basic block sequence are moved into their common head or tail block. This removes duplicate code, frequently removing basic blocks in total. + +##### Collect expression trains + +The compiler tries to put expression chains into code sequences to reduce register pressure and allow for further optimization in the native code generation step. + + +#### Native code optimizations + +Many of the intermediate optimization steps are repeated on the symbolic assembler instructions, but now using the actual CPU architecture, including registers and addressing modes. + +##### Load/Store forwarding + +Load-to-load and store-to-load forwarding is performed using the CPU registers if possible, or zero page locations otherwise. Many values are thus never written into memory, but kept in a register. + +##### Index forwarding + +An index in the X or Y register may follow the path of a continuously changing index. + +##### Address mode simplification + +Indirect addressing is reduced to absolute,x or absolute,y if the 16 bit indirect address can be deduced to be an absolute address and the index is short enough. + +##### Register usage + +The X and Y registers are used for temporaries that would otherwise be placed in zero page locations. This is attempted on a function level and on a basic block level. + +##### Loop index registers + +Loops with a simple 8 bit counter may be changed to use the X or Y register as the loop index. The direction of a loop may also be changed from counting up to a constant to counting down to zero if the bounds are known and the index is not needed inside the loop. + +##### Strength reduction + +Multiplications and divisions are replaced with shifts and/or adds if possible. + +##### Loop Invariants + +Loop invariants are moved outside of the loop. + +##### Loop iteration forwarding + +Register values that are known to be identical to a zero page variable at the end of the loop and reloaded at the start of the loop are kept in the register, and the store and load are moved outside of the loop. + +##### Dead code elimination + +This removes many high byte instructions of 16 bit operations generated by the automatic type promotion to integer required by the C standard. + +##### Common head/tail Sequence coalescing + +Instructions that are common to all paths of a basic block sequence are moved into their common head or tail block. This removes duplicate code, frequently removing basic blocks in total.
+ +Both paths store a single byte into the same register + + void color(bool b) + { + if (b) + vic.color_back = VCOL_BLACK; + else + vic.color_back = VCOL_WHITE; + } + +The store is moved into the basic block that joins the two branches + + color: + 094e LDA P0 + 0950 BEQ $0956 + 0952 LDA #$00 + 0954 BEQ $0958 + 0956 LDA #$01 + 0958 STA $d021 + 095b RTS + +##### Peephole optimizations + +Various small and local optimizations are performed on the code on a per basic block level. + diff --git a/oscar64/NativeCodeGenerator.cpp b/oscar64/NativeCodeGenerator.cpp index 20b539a..6cedba7 100644 --- a/oscar64/NativeCodeGenerator.cpp +++ b/oscar64/NativeCodeGenerator.cpp @@ -714,6 +714,11 @@ bool NativeCodeInstruction::ChangesXReg(void) const return mType == ASMIT_TAX || mType == ASMIT_LDX || mType == ASMIT_INX || mType == ASMIT_DEX || mType == ASMIT_JSR; } +bool NativeCodeInstruction::ReferencesAccu(void) const +{ + return ChangesAccu() || RequiresAccu(); +} + bool NativeCodeInstruction::ReferencesYReg(void) const { return ChangesYReg() || RequiresYReg(); @@ -9539,7 +9544,7 @@ bool NativeCodeBasicBlock::ForwardZpYIndex(void) mIns[i + 0].mType == ASMIT_CLC && mIns[i + 1].mType == ASMIT_LDA && mIns[i + 1].mMode == ASMIM_ZERO_PAGE && mIns[i + 1].mAddress == yreg && mIns[i + 2].mType == ASMIT_ADC && mIns[i + 2].mMode == ASMIM_IMMEDIATE && mIns[i + 2].mAddress == yoffset + 1 && - mIns[i + 3].mType == ASMIT_STA && mIns[i + 3].mMode == ASMIM_ZERO_PAGE && mIns[i + 3].mAddress == yreg && + mIns[i + 3].mType == ASMIT_STA && mIns[i + 3].mMode == ASMIM_ZERO_PAGE && !(mIns[i + 3].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_Y | LIVE_CPU_REG_C))) { for (int j = ypred; j < i; j++) @@ -12961,6 +12966,113 @@ bool NativeCodeBasicBlock::OptimizeSimpleLoopInvariant(NativeCodeProcedure* proc } } + si = 0; + ei = mIns.Size() - 1; + while (si < mIns.Size() && !mIns[si].ReferencesAccu()) + si++; + while (ei > si && !mIns[ei].ReferencesAccu()) + ei--; + + if (si < ei && mIns[si].mType == ASMIT_LDA && mIns[ei].mType == ASMIT_STA && mIns[si].mMode == ASMIM_ZERO_PAGE && mIns[ei].mMode == ASMIM_ZERO_PAGE && mIns[si].mAddress == mIns[ei].mAddress) + { + int i = 0; + while (i < si && !mIns[i].ChangesZeroPage(mIns[si].mAddress)) + i++; + + if (i == si) + { + i = ei + 1; + while (i < mIns.Size() && !mIns[i].ChangesZeroPage(mIns[si].mAddress)) + i++; + + if (i == mIns.Size()) + { + if (!prevBlock) + return OptimizeSimpleLoopInvariant(proc); + + i = 0; + while (i < si) + { + mIns[i].mLive |= LIVE_CPU_REG_A; + i++; + } + + i = ei; + while (i < mIns.Size()) + { + mIns[i].mLive |= LIVE_CPU_REG_A; + i++; + } + + prevBlock->mIns.Push(mIns[si]); + mIns.Remove(si); + return true; + } + } + } + + if (si + 2 < ei && + mIns[si + 0].mType == ASMIT_LDA && + mIns[si + 1].mType == ASMIT_CLC && + mIns[si + 2].mType == ASMIT_ADC && + mIns[ei].mType == ASMIT_STA && mIns[si + 2].mMode == ASMIM_ZERO_PAGE && mIns[ei].mMode == ASMIM_ZERO_PAGE && mIns[si + 2].mAddress == mIns[ei].mAddress) + { + int i = 0; + while (i < si && !mIns[i].ChangesZeroPage(mIns[si].mAddress)) + i++; + + if (i == si) + { + i = ei + 1; + while (i < mIns.Size() && !mIns[i].ChangesZeroPage(mIns[si].mAddress)) + i++; + + if (i == mIns.Size()) + { + if (!prevBlock) + return OptimizeSimpleLoopInvariant(proc); + + i = 0; + while (i < si) + { + mIns[i].mLive |= LIVE_CPU_REG_A; + i++; + } + + i = ei; + while (i < mIns.Size()) + { + mIns[i].mLive |= LIVE_CPU_REG_A; + i++; + } + + mIns[si + 2].CopyMode(mIns[si + 0]); + mIns[si + 0].CopyMode(mIns[ei]); + prevBlock->mIns.Push(mIns[si]); + 
mIns.Remove(si); + return true; + } + } + } + + + if (si < ei && mIns[ei].mType == ASMIT_STA && mIns[ei].mMode == ASMIM_ZERO_PAGE) + { + int j = 0; + while (j < mIns.Size() && (j == ei || !(mIns[j].ChangesZeroPage(mIns[ei].mAddress) || mIns[j].UsesZeroPage(mIns[ei].mAddress)))) + j++; + if (j == mIns.Size()) + { + if (!prevBlock) + return OptimizeSimpleLoopInvariant(proc); + exitBlock->mIns.Insert(0, mIns[ei]); + mIns.Remove(ei); + return true; + } + } + + + if (sz >= 2 && mIns[0].mType == ASMIT_LDY && mIns[0].mMode == ASMIM_ZERO_PAGE) { int i = mIns.Size() - 1; @@ -15275,7 +15387,7 @@ bool NativeCodeBasicBlock::PeepHoleOptimizer(NativeCodeProcedure* proc, int pass if (mIns[i + 0].mType == ASMIT_TYA && mIns[i + 1].mType == ASMIT_CLC && mIns[i + 2].mType == ASMIT_ADC && mIns[i + 2].mMode == ASMIM_IMMEDIATE && mIns[i + 3].mType == ASMIT_TAY && !(mIns[i + 3].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_C))) { if (mIns[i + 4].mType == ASMIT_LDA && (mIns[i + 4].mMode == ASMIM_IMMEDIATE || mIns[i + 4].mMode == ASMIM_IMMEDIATE_ADDRESS || mIns[i + 4].mMode == ASMIM_ZERO_PAGE) && - mIns[i + 5].mType == ASMIT_STA && !(mIns[i + 5].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_Z))) + mIns[i + 5].mType == ASMIT_STA && mIns[i + 5].mMode != ASMIM_INDIRECT_Y && !(mIns[i + 5].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_Z))) { mIns[i + 4].mLive |= LIVE_CPU_REG_Y; mIns[i + 5].mLive |= LIVE_CPU_REG_Y; @@ -15288,8 +15400,8 @@ bool NativeCodeBasicBlock::PeepHoleOptimizer(NativeCodeProcedure* proc, int pass #if 1 else if (i + 6 < mIns.Size() && mIns[i + 4].mType == ASMIT_LDA && (mIns[i + 4].mMode == ASMIM_IMMEDIATE || mIns[i + 4].mMode == ASMIM_IMMEDIATE_ADDRESS || mIns[i + 4].mMode == ASMIM_ZERO_PAGE) && - mIns[i + 5].mType == ASMIT_STA && - mIns[i + 6].mType == ASMIT_STA && !(mIns[i + 6].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_Z))) + mIns[i + 5].mType == ASMIT_STA && mIns[i + 5].mMode != ASMIM_INDIRECT_Y && + mIns[i + 6].mType == ASMIT_STA && mIns[i + 6].mMode != ASMIM_INDIRECT_Y && !(mIns[i + 6].mLive & (LIVE_CPU_REG_A | LIVE_CPU_REG_Z))) { mIns[i + 4].mLive |= LIVE_CPU_REG_Y; mIns[i + 5].mLive |= LIVE_CPU_REG_Y; @@ -17048,7 +17160,8 @@ bool NativeCodeBasicBlock::PeepHoleOptimizer(NativeCodeProcedure* proc, int pass progress = true; } } - +#endif +#if 1 if ( mIns[i + 0].mType == ASMIT_LDY && mIns[i + 0].mMode == ASMIM_IMMEDIATE && mIns[i + 1].mType == ASMIT_LDA && @@ -17073,7 +17186,6 @@ bool NativeCodeBasicBlock::PeepHoleOptimizer(NativeCodeProcedure* proc, int pass progress = true; } } - #endif } @@ -19412,6 +19524,7 @@ void NativeCodeProcedure::Optimize(void) #endif #endif +#if 1 ResetVisited(); if (mEntryBlock->ForwardZpYIndex()) changed = true; @@ -19419,7 +19532,7 @@ void NativeCodeProcedure::Optimize(void) ResetVisited(); if (mEntryBlock->ForwardZpXIndex()) changed = true; - +#endif if (!changed && step < 6) { step++; diff --git a/oscar64/NativeCodeGenerator.h b/oscar64/NativeCodeGenerator.h index e86af02..cdeb824 100644 --- a/oscar64/NativeCodeGenerator.h +++ b/oscar64/NativeCodeGenerator.h @@ -98,6 +98,7 @@ public: bool ChangesYReg(void) const; bool ChangesXReg(void) const; + bool ReferencesAccu(void) const; bool ReferencesYReg(void) const; bool ReferencesXReg(void) const;