[Project Log] Python on the 6502/C64, 8080, 6800, 6809 and AVR

And converting a tuple to a string. It even handles nested tuples! This one violates two of Zach’s rules of microcontrollers: it uses dynamic memory allocation and it is recursive.

 			  02240	;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 			  02241	;
 			  02242	; tuple.str - Convert a tuple to string.
 			  02243	;
 			  02244	; Input:
 			  02245	;	Ptr0 = address of the object
 			  02246	;
 			  02247	; Output:
 			  02248	;	Ptr0 = address of the string
 			  02249	;
 			  02250	; Uses:
 			  02251	;	Int3 - the number of tuple objects left
 			  02252	;	Ptr3 - the address of the next object
 			  02253	;	Ptr4 - the string in progress
 			       	;	Byt0 - nonzero until the first object has been appended
 			       	;	Int0 - size set up before each call to Alloc
 			       	;	Ptr2 - string fragment returned by object.str
 			       	;	Ptr5 - the newly allocated, enlarged string
 			       	;
 			       	; Int3, Ptr3, Ptr4 and Byt0 are pushed on the stack around the call to
 			       	; object.str (which may re-enter this routine for a nested tuple);
 			       	; Int0, Ptr2 and Ptr5 are not saved.
 			       	;
 			       	; Layouts as read and written below:
 			       	;	tuple  - offset 1 = object count, then one 2-byte object
 			       	;		 address per element starting at offset 2
 			       	;	string - offset 0 = $81 (presumably the type tag - TODO
 			       	;		 confirm), offsets 1-2 = length, low byte first,
 			       	;		 characters from offset 3
 			  02254	;
 0C4C			  02255	tuple.str
 0C4C A5 10	      [3] 02256		lda	Ptr0			; Get address of next object
 0C4E 85 16	      [3] 02257		sta	Ptr3			; Note: increment before use
 0C50 A5 11	      [3] 02258		lda	Ptr0+1
 0C52 85 17	      [3] 02259		sta	Ptr3+1
 			  02260
 0C54 A0 01	      [2] 02261		ldy	#1			; Get number of objects
 0C56 B1 10	    [5/6] 02262		lda	(Ptr0),Y
 0C58 85 0C	      [3] 02263		sta	Int3
 0C5A A9 00	      [2] 02264		lda	#0
 0C5C 85 0D	      [3] 02265		sta	Int3+1
 			  02266
 0C5E			  02267	tuple.str0				; Hook for future big tuple type
 0C5E			  02268	tuple.str1
 0C5E A9 05	      [2] 02269		lda	#2+3			; Start the output string
 0C60 85 06	      [3] 02270		sta	Int0			; Room for empty tuple '()'
 0C62 A9 00	      [2] 02271		lda	#0			;   and string header
 0C64 85 07	      [3] 02272		sta	Int0+1
 0C66 20 03D6	      [6] 02273		jsr	Alloc
 0C69 A5 10	      [3] 02274		lda	Ptr0
 0C6B 85 18	      [3] 02275		sta	Ptr4
 0C6D A5 11	      [3] 02276		lda	Ptr0+1
 0C6F 85 19	      [3] 02277		sta	Ptr4+1
 			  02278
 0C71 05 10	      [3] 02279		ora	Ptr0			;*** Out of memory check here
 			       	; NOTE(review): A = Ptr0 | Ptr0+1 at this point but nothing branches
 			       	; on it yet, so a failed Alloc (presumably a zero address) is not
 			       	; handled and the stores below go through whatever Ptr4 holds.
 			  02280
 0C73 A9 01	      [2] 02281		lda	#1
 0C75 85 03	      [3] 02282		sta	Byt0			; First time
 			  02283
 0C77 A0 00	      [2] 02284		ldy	#0			; Initially '('
 0C79 A9 81	      [2] 02285		lda	#$81
 0C7B 91 18	      [6] 02286		sta	(Ptr4),Y
 0C7D C8	      [2] 02287		iny
 0C7E A9 01	      [2] 02288		lda	#1
 0C80 91 18	      [6] 02289		sta	(Ptr4),Y
 0C82 C8	      [2] 02290		iny
 0C83 A9 00	      [2] 02291		lda	#0
 0C85 91 18	      [6] 02292		sta	(Ptr4),Y
 0C87 C8	      [2] 02293		iny
 0C88 A9 28	      [2] 02294		lda	#'('
 0C8A 91 18	      [6] 02295		sta	(Ptr4),Y
 			  02296
 0C8C A5 0C	      [3] 02297		lda	Int3			; Any more objects?
 0C8E 05 0D	      [3] 02298		ora	Int3+1
 0C90 D0 03 (0C95)  [2/3] 02299		bne	tuple.str2
 			  02300
 0C92 4C 0D6B	      [3] 02301		jmp	tuple.str6
 			  02302
 0C95			  02303	tuple.str2
 0C95 A5 0C	      [3] 02304		lda	Int3			; Decrement object count
 0C97 38	      [2] 02305		sec
 0C98 E9 01	      [2] 02306		sbc	#1
 0C9A 85 0C	      [3] 02307		sta	Int3
 0C9C A5 0D	      [3] 02308		lda	Int3+1
 0C9E E9 00	      [2] 02309		sbc	#0
 0CA0 85 0D	      [3] 02310		sta	Int3+1
 			  02311
 0CA2 A5 16	      [3] 02312		lda	Ptr3			; Increment object pointer
 0CA4 18	      [2] 02313		clc
 0CA5 69 02	      [2] 02314		adc	#2
 0CA7 85 16	      [3] 02315		sta	Ptr3
 0CA9 A5 17	      [3] 02316		lda	Ptr3+1
 0CAB 69 00	      [2] 02317		adc	#0
 0CAD 85 17	      [3] 02318		sta	Ptr3+1
 			  02319
 0CAF A0 00	      [2] 02320		ldy	#0			; Get next object
 0CB1 B1 16	    [5/6] 02321		lda	(Ptr3),Y
 0CB3 85 10	      [3] 02322		sta	Ptr0
 0CB5 C8	      [2] 02323		iny
 0CB6 B1 16	    [5/6] 02324		lda	(Ptr3),Y
 0CB8 85 11	      [3] 02325		sta	Ptr0+1
 			  02326
 0CBA A5 0D	      [3] 02327		lda	Int3+1			; Save my variables
 0CBC 48	      [3] 02328		pha
 0CBD A5 0C	      [3] 02329		lda	Int3
 0CBF 48	      [3] 02330		pha
 0CC0 A5 17	      [3] 02331		lda	Ptr3+1
 0CC2 48	      [3] 02332		pha
 0CC3 A5 16	      [3] 02333		lda	Ptr3
 0CC5 48	      [3] 02334		pha
 0CC6 A5 19	      [3] 02335		lda	Ptr4+1
 0CC8 48	      [3] 02336		pha
 0CC9 A5 18	      [3] 02337		lda	Ptr4
 0CCB 48	      [3] 02338		pha
 0CCC A5 03	      [3] 02339		lda	Byt0
 0CCE 48	      [3] 02340		pha
 			  02341
 0CCF 20 0D79	      [6] 02342		jsr	object.str		; Convert object to string
 			  02343
 			       	; Pulls below are in exactly the reverse order of the pushes above.
 0CD2 68	      [4] 02344		pla				; Recover my variables
 0CD3 85 03	      [3] 02345		sta	Byt0
 0CD5 68	      [4] 02346		pla
 0CD6 85 18	      [3] 02347		sta	Ptr4
 0CD8 68	      [4] 02348		pla
 0CD9 85 19	      [3] 02349		sta	Ptr4+1
 0CDB 68	      [4] 02350		pla
 0CDC 85 16	      [3] 02351		sta	Ptr3
 0CDE 68	      [4] 02352		pla
 0CDF 85 17	      [3] 02353		sta	Ptr3+1
 0CE1 68	      [4] 02354		pla
 0CE2 85 0C	      [3] 02355		sta	Int3
 0CE4 68	      [4] 02356		pla
 0CE5 85 0D	      [3] 02357		sta	Int3+1
 			  02358
 0CE7 A5 10	      [3] 02359		lda	Ptr0			; Save address of new string fragment
 0CE9 85 14	      [3] 02360		sta	Ptr2
 0CEB A5 11	      [3] 02361		lda	Ptr0+1
 0CED 85 15	      [3] 02362		sta	Ptr2+1
 			  02363
 0CEF A0 01	      [2] 02364		ldy	#1			; Get length of new string fragment
 0CF1 B1 14	    [5/6] 02365		lda	(Ptr2),Y
 0CF3 18	      [2] 02366		clc
 0CF4 69 06	      [2] 02367		adc	#3+3			; Plus room for ', ', ')'
 0CF6 85 06	      [3] 02368		sta	Int0			;   and string header
 0CF8 C8	      [2] 02369		iny
 0CF9 B1 14	    [5/6] 02370		lda	(Ptr2),Y
 0CFB 69 00	      [2] 02371		adc	#0
 0CFD 85 07	      [3] 02372		sta	Int0+1
 			  02373
 			       	; NOTE(review): there is no clc before the next addition; the carry
 			       	; is whatever the adc #0 above left, i.e. set only if the fragment
 			       	; length plus 6 overflowed 16 bits.
 0CFF 88	      [2] 02374		dey				; Add length of current string
 0D00 B1 18	    [5/6] 02375		lda	(Ptr4),Y
 0D02 65 06	      [3] 02376		adc	Int0
 0D04 85 06	      [3] 02377		sta	Int0
 0D06 C8	      [2] 02378		iny
 0D07 B1 18	    [5/6] 02379		lda	(Ptr4),Y
 0D09 65 07	      [3] 02380		adc	Int0+1
 0D0B 85 07	      [3] 02381		sta	Int0+1
 			  02382
 0D0D 20 03D6	      [6] 02383		jsr	Alloc			; Allocate a new string
 			  02384
 0D10 A5 10	      [3] 02385		lda	Ptr0			; Remember the new string address
 0D12 85 1A	      [3] 02386		sta	Ptr5
 0D14 A5 11	      [3] 02387		lda	Ptr0+1
 0D16 85 1B	      [3] 02388		sta	Ptr5+1
 			  02389
 0D18 05 10	      [3] 02390		ora	Ptr0			;*** Out of memory check here
 			       	; NOTE(review): as above, the result of this ora is not tested yet.
 			  02391
 			       	; Presumably CopyString copies the string at Ptr4 into the block at
 			       	; Ptr0 - its interface is not visible here; TODO confirm.
 0D1A 20 0C28	      [6] 02392		jsr	CopyString		; Copy current string to new
 			  02393
 0D1D A5 18	      [3] 02394		lda	Ptr4			; Free the old string
 0D1F 85 10	      [3] 02395		sta	Ptr0
 0D21 A5 19	      [3] 02396		lda	Ptr4+1
 0D23 85 11	      [3] 02397		sta	Ptr0+1
 0D25 20 0480	      [6] 02398		jsr	Free
 			  02399
 0D28 A5 1A	      [3] 02400		lda	Ptr5			; Use the enlarged string
 0D2A 85 18	      [3] 02401		sta	Ptr4
 0D2C A5 1B	      [3] 02402		lda	Ptr5+1
 0D2E 85 19	      [3] 02403		sta	Ptr4+1
 			  02404
 0D30 A5 03	      [3] 02405		lda	Byt0			; Is this the first one?
 0D32 D0 0A (0D3E)  [2/3] 02406		bne	tuple.str3		; Branch if yes
 			  02407
 0D34 A9 2C	      [2] 02408		lda	#','			; Add ', '
 0D36 20 0BDF	      [6] 02409		jsr	ConcatChar
 0D39 A9 20	      [2] 02410		lda	#' '
 0D3B 20 0BDF	      [6] 02411		jsr	ConcatChar
 			  02412
 0D3E			  02413	tuple.str3
 0D3E 20 0BF9	      [6] 02414		jsr	ConcatString		; Add the new object string
 			  02415
 0D41 A5 14	      [3] 02416		lda	Ptr2			; Free the new string fragment
 0D43 85 10	      [3] 02417		sta	Ptr0
 0D45 A5 15	      [3] 02418		lda	Ptr2+1
 0D47 85 11	      [3] 02419		sta	Ptr0+1
 0D49 20 0480	      [6] 02420		jsr	Free
 			  02421
 0D4C A5 03	      [3] 02422		lda	Byt0			; Is this the first one?
 0D4E F0 12 (0D62)  [2/3] 02423		beq	tuple.str4		; Branch if no
 			  02424
 0D50 A9 00	      [2] 02425		lda	#0			; No longer the first one
 0D52 85 03	      [3] 02426		sta	Byt0
 			  02427
 0D54 A5 0C	      [3] 02428		lda	Int3			; Was this the only object?
 0D56 05 0D	      [3] 02429		ora	Int3+1
 0D58 D0 0E (0D68)  [2/3] 02430		bne	tuple.str5		; Branch if no
 			  02431
 			       	; A one-element tuple gets a trailing comma: '(x,)'
 0D5A A9 2C	      [2] 02432		lda	#','			; Add ','
 0D5C 20 0BDF	      [6] 02433		jsr	ConcatChar
 			  02434
 0D5F 4C 0D6B	      [3] 02435		jmp	tuple.str6
 			  02436
 0D62			  02437	tuple.str4
 0D62 A5 0C	      [3] 02438		lda	Int3			; Any more objects?
 0D64 05 0D	      [3] 02439		ora	Int3+1
 0D66 F0 03 (0D6B)  [2/3] 02440		beq	tuple.str6
 			  02441
 			       	; tuple.str2 is out of range of a relative branch from here, hence
 			       	; the jmp trampoline.
 0D68			  02442	tuple.str5
 0D68 4C 0C95	      [3] 02443		jmp	tuple.str2
 			  02444
 0D6B			  02445	tuple.str6
 0D6B A9 29	      [2] 02446		lda	#')'			; Close the tuple
 0D6D 20 0BDF	      [6] 02447		jsr	ConcatChar
 			  02448
 0D70 A5 18	      [3] 02449		lda	Ptr4			; Return string in Ptr0
 0D72 85 10	      [3] 02450		sta	Ptr0
 0D74 A5 19	      [3] 02451		lda	Ptr4+1
 0D76 85 11	      [3] 02452		sta	Ptr0+1
 			  02453
 0D78 60	      [6] 02454		rts
2 Likes

That’s okay, the 6502 isn’t a microcontroller! :slight_smile:

1 Like

That’s all its progeny are today…

Because it’s not there…

I took on learning Python because it appeared to be a language with demand. Then came the realization that it is so popular because it is today’s BASIC. For good and bad. Implementing some of the Pythonics was an irresistible challenge.

5 Likes

Because it’s better than running Node.js or even Lua on IoT/embedded devices. Seriously, have you tried nodemcu? A little bloated imho. Sure, Lua is great, but Python is an everyday language (just as BASIC and Tcl/Tk were) while JavaScript is a different beast altogether.

6 posts were split to a new topic: Embedded Language Wars

The 6502 instruction set is kind of spare when you compare it with, for example, the 6809 (or even the lesser 6800, Z80, and 8080). However, even when you have to use several instructions in place of a single 6809 instruction, the number of cycles ends up being comparable. The code density is what suffers, and that’s because the 6502 is intentionally RISC. For many small embedded applications, the code density is comparable and the 6502 uses many fewer cycles.

A good macro assembler can make the 6502 much less tedious and can improve readability.

We did get an Amiga 1200 in and that is hooked up right now but the c128 will still be around for your use and we’ll have to find a way to secure it on the wall while not in use.

2 Likes

Actually, the 6800 and 6809 are very similar to the 6502 in that they take one machine cycle to do a memory read or write or ALU operation, unlike the 8080 or Z80 which takes around 3 T cycles. If you can get it into the X register, the 680x can do a 16-bit increment or decrement in 4 machine cycles. The 680x has two capable accumulators. The advantage the 6502 has is that it pipelines the fetch of the next instruction.

My assembler does macros. But there are many flavors of operations: zeropage or absolute addressing versus indirect using register Y, simple increment or decrement versus adding or subtracting a constant or another variable. Then there are standalone operations versus ones to be done in a sequence where a little gain can be had by keeping a byte in the X register. If I create macros to cover every case, it will begin looking like the ARM with its conditional execution and optional updating of condition codes.

This is sort of apples and oranges, but this is a block copy subroutine for the 6502:

 0344 A0 00	      [2] 00498	MovBlk	ldy	#0			; Start of first page
 			  00499
 0346 A6 07	      [3] 00500		ldx	Int0+1			; At least one full page remaining?
 0348 F0 07 (0351)  [2/3] 00501		beq	MovBlk0			; Branch if no
 			  00502
 034A A2 00	      [2] 00503		ldx	#0			; Set up to move an entire page
 034C C6 07	      [5] 00504		dec	Int0+1			; One fewer whole page remaining
 			  00505
 034E 4C 0357	      [3] 00506		jmp	MovBlk1
 			  00507
 0351 A6 06	      [3] 00508	MovBlk0	ldx	Int0			; Any remaining on a partial page?
 0353 F0 0F (0364)  [2/3] 00509		beq	MovBlk2			; No
 			  00510
 0355 84 06	      [3] 00511		sty	Int0			; This will finish the partial page
 			  00512
 0357 B1 10	    [5/6] 00513	MovBlk1	lda	(Ptr0),Y		; Move a byte
 0359 91 12	      [6] 00514		sta	(Ptr1),Y
 035B C8	      [2] 00515		iny
 035C CA	      [2] 00516		dex				; More to move on this page?
 035D D0 F8 (0357)  [2/3] 00517		bne	MovBlk1			; Yes
 			  00518
 035F E6 11	      [5] 00519		inc	Ptr0+1			; Address next page
 			  00520
 0361 4C 0344	      [3] 00521		jmp	MovBlk			; Check for another page
 			  00522
 0364 60	      [6] 00523	MovBlk2	rts

While this is the one for the 6800:

 0171 B6 016F	      [4] 00213	Copy_    ldaa   CopyC_
 0174 BA 0170	      [4] 00214	         oraa   CopyC_+1
 0177 27 21 (019A)    [4] 00215	         beq    Copy2_
 0179 FE 016B	      [5] 00216	Copy1_   ldx    CopyS_
 017C A6 00	      [5] 00217	         ldaa   ,X
 017E 08	      [4] 00218	         inx
 017F FF 016B	      [6] 00219	         stx    CopyS_
 0182 FE 016D	      [5] 00220	         ldx    CopyD_
 0185 A7 00	      [6] 00221	         staa   ,X
 0187 08	      [4] 00222	         inx
 0188 FF 016D	      [6] 00223	         stx    CopyD_
 018B 7A 0170	      [6] 00224	         dec    CopyC_+1
 018E 26 E9 (0179)    [4] 00225	         bne    Copy1_
 0190 7D 016F	      [6] 00226	         tst    CopyC_
 0193 27 05 (019A)    [4] 00227	         beq    Copy2_
 0195 7A 016F	      [6] 00228	         dec    CopyC_
 0198 26 DF (0179)    [4] 00229	         bne    Copy1_
 019A 39	      [5] 00230	Copy2_   rts

The 6800 code does not use variables in the direct page; if it did, it would take one fewer cycle for each instruction which read or wrote a variable (Copy?_)

Edit: …and for the 8080:

 0000			  00001	BlkMov:
 0000 2A 0017	     [16] 00002		lhld	Src
 0003 EB	      [4] 00003		xchg
 0004 2A 0019	     [16] 00004		lhld	Count
 0007 44	      [5] 00005		mov	B,H
 0008 4D	      [5] 00006		mov	C,L
 0009 2A 0015	     [16] 00007		lhld	Dest
 			  00008
 000C			  00009	Loop:
 000C 1A	      [7] 00010		ldax	D
 000D 77	      [7] 00011		mov	M,A
 000E 23	      [5] 00012		inx	H
 000F 13	      [5] 00013		inx	D
 0010 0B	      [5] 00014		dcx	B
 0011 C2 000C	     [10] 00015		jnz	Loop
 			  00016
 0014 C9	     [10] 00017		ret

Further edit; and for the AVR, the controller on the arduino:

 			       	; Copy - copy bytes from the address in Z (R31:R30) to the address
 			       	; in Y (R29:R28); the byte count is in R27:R26. R22 is scratch.
 			       	; Source addresses at or above SRAM_START+SRAM_SIZE are read with
 			       	; lpm (program memory) instead of ld.
 000000 9610	      [2] 00005	Copy:	adiw	R26,0			; Count zero?
 000001 F041=00000A [1/2] 00006		breq	Copy2			; Yes, nothing to copy
 000002 E161	      [1] 00007		ldi	R22,high(SRAM_START+SRAM_SIZE)	; Compare Z with the
 000003 30E0	      [1] 00008		cpi	R30,low(SRAM_START+SRAM_SIZE)	;   address just past
 000004 07F6	      [1] 00009		cpc	R31,R22				;   the end of SRAM
 000005 F428=00000B [1/2] 00010		brcc	Copy3			; At or above: use the lpm loop
 000006 9161	      [2] 00011	Copy1:	ld	R22,Z+			; Fetch a byte with ld
 000007 9369	      [2] 00012		st	Y+,R22			; Store it
 000008 9711	      [2] 00013		sbiw	R26,1			; One fewer to copy
 000009 F7E1=000006 [1/2] 00014		brne	Copy1			; Loop until the count is zero
 00000A 9508	      [2] 00015	Copy2:	ret
 00000B 9165	      [3] 00016	Copy3:	lpm	R22,Z+			; Fetch a byte with lpm
 00000C 9369	      [2] 00017		st	Y+,R22			; Store it
 00000D 9711	      [2] 00018		sbiw	R26,1			; One fewer to copy
 00000E F7E1=00000B [1/2] 00019		brne	Copy3			; Loop until the count is zero
 00000F 9508	      [2] 00020		ret

Edit once again; and the champion of the 8-bitters, the 6809;

    1   0000 FE   0015     Copy    ldu    Src		; 6 cycles; U = source
    2   0003 10BE 0017             ldy    Dest		; 7 cycles; Y = destination
    3   0007 BE   0019             ldx    Count		; 6 cycles; X = bytes remaining
    4   000A 27   08               beq    Done		; 3 cycles; nothing to copy
    5                      
    6   000C A6   C0       Loop    lda    ,U+		; 4+2 cycles
    7   000E A7   A0               sta    ,Y+		; 4+2 cycles
    8   0010 30   1F               leax   -1,X		; 4+1 cycles; leax sets Z when X reaches 0
    9   0012 26   F8               bne    Loop		; 3 cycles (inner loop: 20 in total)
   10                      
   11   0014 39            Done    rts			; 5 cycles

I do not have one for the 8080/Z80 handy, but it can use BC, DE and HL to indirectly access memory, so it would use a lot less memory access switching pointers.

Looking at the 6800 code now, I could replace

 018B 7A 0170	      [6] 00224	         dec    CopyC_+1
 018E 26 E9 (0179)    [4] 00225	         bne    Copy1_
 0190 7D 016F	      [6] 00226	         tst    CopyC_
 0193 27 05 (019A)    [4] 00227	         beq    Copy2_
 0195 7A 016F	      [6] 00228	         dec    CopyC_
 0198 26 DF (0179)    [4] 00229	         bne    Copy1_

with something like (not correct code, just similar instructions to what would be needed):

 0179 FE 016B	      [5] 00216	Copy1_   ldx    CopyS_
 017E 08	      [4] 00218	         inx
 017F FF 016B	      [6] 00219	         stx    CopyS_
 0198 26 DF (0179)    [4] 00229	         bne    Copy1_

Edit: looking back at it, I would not do the replacement. In 255 out of 256 cases, the branch at the second instruction would be taken back to the top of the loop after only 10 cycles.

Further edit: CopyC_+1 could be kept in the B register for a savings of 4 cycles each time around the inner loop.

Looking back at the block copy routines, the 6800 is the worst processor in this particular case. The lack of a second index register severely hurt it. I do not have a version of the code for the 6809, but it should do much better since it has two additional index registers.

The 6502 does very well as the zero page locations can be effectively used for indexing. The 8080 did OK because its register pairs can be used for indexing.

In terms of raw speed, the AVR shines. It has many registers and several of them have fancy autoincrement modes. But it has a very limited amount of static RAM, so a general purpose computer it is not.

Edit: It appears that the 6502, with its funky indirect indexed addressing mode, managed to beat the 6809 by 2 cycles in the inner loop of a block copy.

An interesting exercise for another day is to compare how each one does implementing the FORTH inner interpreter.

Many of these chips have as much or more RAM than the 70’s and 80’s general purpose computers. Some as much as 256 kbytes of RAM.

Anyway, back to the Python compiler. I go from hard to very hard. The print function is done except for handling the keyword arguments.

In Python, variables are just a reference to an object. Essentially nothing but a pointer. A variable can reference an object of any type; a boolean, an integer, a real number, a string or even a function. The run-time code must determine the type and do something reasonable with it.

The bad part of it, at least for the compiler implementer, is that nothing is known at compile time about the number and types of the arguments to a function. Adding to the complexity is that you can have traditional positional arguments, and then some keyword arguments and they can be assigned a default value if the function call does not specify one.

Languages like Pascal have been criticized because parts of the language are “special.” Write and writeln cannot be replaced or extended without modifying the compiler. Python, at least Python 3, does not have that problem. I can essentially write:

def myPrint(*pargs, **kargs):
        print(*pargs, **kargs)

and alter the way the print function works. I can even do

        oldPrint = print
        print = myPrint

and replace the provided one with mine.

But that power comes with a heavy price. Especially with Python’s dynamic typing. The burden is on the run-time code to figure out how to map the arguments when a function is called.

The AVR does not. I do not know whether the larger MSP430s do. They are also Harvard architecture, making them less suitable for general purpose use loading arbitrary application code from a storage device. The ARM is the notable exception to these limitations and why it is so popular today.

You’re correct, I just looked at the data sheet and they seem to max out at 16K of RAM, which is still more than enough to run a general purpose computer, and with some of the larger pin devices it wouldn’t be hard to add more external RAM that could be paged in/out of the chip.

It certainly wouldn’t be a viable design, but it is doable, and would be an interesting exercise in retro inspired computer design. If UNIX could be developed on a PDP/7, it would certainly be possible to create a version for the AVR…

I remembered this

So it would be possible to implement an AVR on an FPGA with as many resources as one wants… Can’t you see it, an AVR with a PDP/11 front panel running UNIX and supporting the native debugging of Arduino and Assembly code?

Yep, and it is possible to programmatically change your flash on the AVR with a running program. So a simple monitor-like program could load from an SD card, or through the serial port, and place the code in flash and allow you to run it. Haven’t looked at AVR assembly and don’t remember if there is any way for the code to single step…

There is a way for the code to write to flash memory since the bootloader does exactly that. I never learned to do that, but it was something I will eventually have to do to really finish my FORTH implementation for the AVR.

I do not remember there being a single step bit or interrupt like the x86. You may have to add some external hardware to yank an NMI to do that.

I originally wrote this on the x86 16-bit following the roadmap in the Byte book on Threaded Interpretive Languages:

    656				     ;-----------------------------------------------------------------------------
    657				     ;
    658				     ; Inner interpreter implementation
    659				     ;
				     ; Register roles as used below:
				     ;	SI = instruction register (next entry in the current secondary)
				     ;	BP = return stack pointer
				     ;	DI = word address of the word being run
				     ;	BX = address of the machine code RUN jumps to
				     ;
    660	00E5			     CODE    ends
    661	0000			     DICT    segment
    662
    663				     ;
    664				     ; SEMI does not have a header, but	has a standard word address
    665				     ;
    666	0000  00E5r		     _SEMI   dw	     offset SEMI
Turbo Assembler	 Version 3.2	    05/06/15 15:45:56	    Page 10
tilli.ASM
TILLI.ASM - Threaded Interpretive Language Little Implementation


    667
    668	0002			     DICT    ends
    669	00E5			     CODE    segment
    670
				     ; SEMI falls through into NEXT, which falls through into RUN.
    671	00E5			     SEMI:
    672				     public  SEMI
    673	00E5  8B 76 00			     mov     SI,[BP]		     ; Pop return address
    674	00E8  83 C5 02			     add     BP,2		     ; Drop it from the return stack
    675
    676	00EB			     NEXT:
    677				     public  NEXT
    678	00EB  8B 3C			     mov     DI,[SI]		     ; Get next	word address
    679	00ED  83 C6 02			     add     SI,2		     ; Advance to the following entry
    680
    681	00F0			     RUN:
    682				     public  RUN
    683	00F0  8B 1D			     mov     BX,[DI]		     ; Run a threaded word
    684	00F2  83 C7 02			     add     DI,2		     ; DI now points just past the code address
    685	00F5  FF E3			     jmp     BX			     ; Enter the word's machine code
    686
    687	00F7			     __COLON:
    688				     public  __COLON
    689	00F7  83 ED 02			     sub     BP,2		     ; Push instruction register
    690	00FA  89 76 00			     mov     [BP],SI
    691	00FD  8B F7			     mov     SI,DI		     ; Point to	nested secondary
    692	00FF  EB EA			     jmp     short NEXT		     ; Fetch and run its first word
    693
    694
    695				     ;-----------------------------------------------------------------------------
    696				     ;
    697				     ; EXECUTE ( addr -- )
    698				     ;
    699				     ; Execute dictionary entry	at compilation address on stack; for example,
    700				     ; address returned	by FIND.
    701				     ;
    702	0101			     CODE    ends
    703	0002			     DICT    segment
    704
				     ; The header macro expands to the link to the previous entry, a
				     ; length byte and the name, as shown in the expansion lines below.
    705					     header  <'EXECUTE'>
1   706	0002  0000			     dw	     PREV_ENTRY
1   707	0004  07			     db	     offset ??0007 - offset $ -	1    ; the 'EXECUTE' length
1   708	0005  45 58 45 43 55 54	45	     db	     'EXECUTE'
1   709	000C			     ??0007  label   byte
    710
    711	000C  0101r		     _EXECUTE	     dw	     offset __EXECUTE
    712
    713	000E			     DICT    ends
    714	0101			     CODE    segment
    715
    716	0101			     __EXECUTE:
    717	0101  5F			     pop     DI			     ; Get word	address	of the word
    718	0102  EB EC			     jmp     Run		     ; Enter the inner interpreter at RUN with DI set

From there, I ported it to the 6800;

 			  00532	;=== < Inner interpreter >====================================================
 			  00533
 			  00534	;-----------------------------------------------------------------------------
 			  00535	;
 			  00536	; SEMI does not have a header, but has a standard word address
 			  00537	;
 			       	; Variables used by the inner interpreter (direct page, per the
 			       	; opcodes below: RS = $00, IR = $02, WA = $04):
 			       	;	RS = return stack pointer; it points two bytes below the most
 			       	;	     recently pushed entry
 			       	;	IR = instruction register; it is advanced by two *before* the
 			       	;	     fetch in Next, so it points at the entry being executed
 			       	;	WA = word address of the word being run
 			       	; The commented-out lines after each routine are the x86 original
 			       	; that the 6800 code was ported from.
 			       	;
 016E 0170		  00538	_SEMI    fdb    SEMI
 			  00539
 0170			  00540	SEMI:
 0170 DE 00	      [4] 00541	         ldx    RS        ; Pop IR from return stack
 0172 08	      [4] 00542	         inx
 0173 08	      [4] 00543	         inx
 0174 DF 00	      [5] 00544	         stx    RS
 0176 EE 00	      [6] 00545	         ldx    ,X
 0178 DF 02	      [5] 00546	         stx    IR
 			  00547	;	mov	SI,[BP]			; Pop return address
 			  00548	;	add	BP,2
 			  00549
 017A			  00550	Next:
 017A DE 02	      [4] 00551	         ldx    IR        ; Step IR to the next entry
 017C 08	      [4] 00552	Next1    inx
 017D 08	      [4] 00553	         inx
 017E DF 02	      [5] 00554	         stx    IR
 0180 EE 00	      [6] 00555	         ldx    ,X        ; Fetch its word address
 			  00556	;	mov	DI,[SI]			; Get next word address
 			  00557	;	add	SI,2
 			  00558
 0182			  00559	RUN:                      ; WA in X
 0182 DF 04	      [5] 00560	         stx    WA        ; Run machine code of new word
DEI Research 6800 Cross Assembler  Version 0.0   05-16-2015 112:47:26  Page 11
tilli.a68

 Addr Code	   Cycles Line#	  Source Statement

 0184 EE 00	      [6] 00561	Run1     ldx    ,X        ; X = code address stored at WA
 0186 6E 00	      [4] 00562	         jmp    ,X
 			  00563	;	mov	BX,[DI]			; Run a threaded word
 			  00564	;	add	DI,2
 			  00565	;	jmp	BX
 			  00566
 0188			  00567	__COLON:
 0188 DE 00	      [4] 00568	         ldx    RS        ; Push instruction register
 018A D6 03	      [3] 00569	         ldab   IR+1      ; on return stack
 018C 09	      [4] 00570	         dex
 018D E7 02	      [6] 00571	         stab   2,X
 018F 96 02	      [3] 00572	         ldaa   IR
 0191 09	      [4] 00573	         dex
 0192 A7 02	      [6] 00574	         staa   2,X
 0194 DF 00	      [5] 00575	         stx    RS
 			  00576
 			       	; Entering at Next1 steps X from WA to WA+2 before storing it in IR.
 0196 DE 04	      [4] 00577	         ldx    WA        ; Execute new secondary
 0198 20 E2 (017C)    [4] 00578	         bra    Next1
 			  00579	;	sub	BP,2			; Push instruction register
 			  00580	;	mov	[BP],SI
 			  00581	;	mov	SI,DI			; Point to nested secondary
 			  00582	;	jmp	short NEXT
 			  00598	;	add	DI,2
 			  00599	;	jmp	BX
 			  00600
 00010B			  00601	?COLON:
 00010B 01F7	      [1] 00602		movw	R30,R14			; Push instruction register
 00010C 93B2	      [2] 00603		st	-Z,R27
 00010D 93A2	      [2] 00604		st	-Z,R26
 00010E 017F	      [1] 00605		movw	R14,R30
 00010F 01DE	      [1] 00606		movw	R26,R28			; Point to nested secondary
 000110 CFF5=000106   [2] 00607		rjmp	Next			; And run it
 			  00608	;	sub	BP,2
 			  00609	;	mov	[BP],SI
 			  00610	;	mov	SI,DI
 			  00611	;	jmp	short Next
 			  00612
 			  00613	;-----------------------------------------------------------------------------
 			  00614	;
 			  00615	; EXECUTE ( addr -- )
 			  00616	;
 			  00617	; Execute dictionary entry at compilation address on stack; for example,
 			  00618	; address returned by FIND.
 			  00619	;
 			  00620	.init
 			  00621	header	"EXECUTE",0
+         =00000148		 .set	THIS_ENTRY	= PC
+0148 0138			 .dw	PREV_ENTRY
+         =00000148		 .set	PREV_ENTRY	= THIS_ENTRY
+014A 0745584543555445		 .db	0 | strlen("EXECUTE"),"EXECUTE"
DEI Research AVR Cross Assembler  Version 0.0   Jun-23-2015 57:32:32  Page 12
tilli.ASM

  Addr   Code	   Cycles Line#	  Source Statement

 0152 0111		  00622	_EXECUTE:	.dw	__EXECUTE
 			  00623	.cseg
 			  00624
 000111			  00625	__EXECUTE:
 000111 91CF	      [2] 00626		pop	R28			; Get word address of the new word
 000112 91DF	      [2] 00627		pop	R29
 000113 CFF4=000108   [2] 00628		rjmp	Run
 			  00629	;	pop	DI
 			  00630	;	jmp	Run