
	org		0000H
	INCL	"common.def"


;we boot in page 1, so we have a bit of space here


;COP0 write to EntryLo: load r4:r5 with DCL1_ENTRYLO and tail-jump to the shared
;full-register write handler (cop0_write_fullreg, defined outside this section).
;FIM + JUN are 2 bytes each, so this fits in 0x000..0x003, below the fixed entry at 0x007.
cop0_write_entryLo:
	FIM		r4, DCL1_ENTRYLO
	JUN		cop0_write_fullreg


	org		0007H
	;execution continues from this same address on page 0 (boot itself starts in page 1): go run the emulator init
	JUN		cpuStart

;gates for jumps back from ops in page 1

	org		0009H
;Veneer: enter the page-1 implementation of the three-operand AND.
;In: r12 must be < 4, r13 = DCL to operate on
;Selects DCL 1, sends r12:r13 as the RAM address and writes A (= 1) to that RAM chip's output port.
;NOTE(review): presumably this port write is what switches the ROM page - confirm against the board design
cpuPrvThreeOperandAnd_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;the four 1-byte instructions end at 0x0c; code continues on page 1 from this addr here (0x0d)

	org		000dH
;Return gate: returns to the caller with A = 0.
;NOTE(review): presumably page-1 code switches back to page 0 just before this address so that this BBL executes - confirm
cpuPrvVeneerReturn:
	BBL		0

	org		000eH
;Veneer: enter the page-1 implementation of the three-operand OR.
;In: r12 must be < 4, r13 = DCL to operate on
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvThreeOperandOrr_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x12)


	org		0012H
;Veneer: enter the page-1 implementation of the three-operand XOR.
;In: r12 must be < 4, r13 = DCL to operate on
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvThreeOperandXor_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x16)

	org		0016H
;Veneer: enter the page-1 implementation of the three-operand NOR.
;In: r12 must be < 4, r13 = DCL to operate on
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvThreeOperandNor_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x1a)

	org		001aH
;Veneer: enter the page-1 multiply routine.
;In: r12 must be < 4 for this to work at all; for us r12 = "is signed", r13 = DCL to operate on (likely 1)
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvMultX_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x1e)

	org		001eH
;Veneer: enter the page-1 hardware + software init routine.
;In: r12 must be < 4 for this to work at all
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
hwAndSwInit_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x22)

	org		0022H
;Veneer: enter the page-1 hypercall handler.
;In: r12 must be < 4 for this to work at all
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvHypercall_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x26)

	org		0026H
;Veneer: enter the page-1 putchar routine.
;In: r12 must be < 4 for this to work at all, r0:r1 is the char
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
;Note: the next veneer is placed at 0x2c, so 0x2a..0x2b on this page are left unassigned here.
cpuPrvPutchar_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x2a)

	org		002cH
;Veneer: enter the page-1 getchar routine.
;In: r12 must be < 4 for this to work at all
;Out (per the caller in check_irq): char in r0:r1, C = got a byte
;Same sequence as the other veneers: DCL 1, address RAM with r12:r13, write A (= 1) to the output port.
cpuPrvGetchar_veneer:
	LDM		1
	DCL
	SRC		r12
	WMP
	;code continues on page 1 from this addr here (0x30)

	org		0030H
;Veneer: enter the page-1 unsigned divide.
;In: must be entered with DCL=1 (unlike the other veneers this one issues no DCL itself), tmp64a.lo = denom, hi = num
;Clobbers r12:r13 (forced to 0:0 so that the port write addresses the expected RAM chip).
cpuPrvDivu_veneer:
	FIM		r12, 0
	LDM		1
	SRC		r12
	WMP
	;FIM is 2 bytes, so this veneer is 5 bytes long; code continues on page 1 from this addr here (0x35)


;Emulator entry: run the page-1 init, build the TLB entry chain, then issue the
;"read storage sector" hypercall, whose handler continues into the normal instruction loop.
cpuStart:	;	init and go

	;4002 specs say that reset will clear them, no need to do it ourselves
	
	FIM		r12, 0					;r12:r13 = 0:0 satisfies the veneer's "r12 < 4" requirement
	JMS		hwAndSwInit_veneer


	;we then need to actually link the TLB entries up into the chain
	;first and last are special, all others are simple
	;due to how we init things, we only need to fix the first entry :)
	;this means that each item N has addr of 80000000, prev of N-1 and next of N+1
	LDM		TLB_ENTRIES_DCL
	DCL
	FIM		r0, 07H		;;r0 = entryIter, r1 = point to VA's top nibble


tlb_init_loop:		;always inits 16 entries even if we have fewer. writing to nonexistent rams is fine
	SRC		r0		;select entry IDX

	LD		r0		;entry->next = IDX + 1, entry->haveNext = IDX != 15
	IAC				;A = curIdx + 1, C = NOT haveNext (== carry)
	WR0
	LDM		1		;always have next (we'll fix it later)
	WR1

	LDM		8
	WRM				;	->va = 0x80000000 (only the top nibble, which r1 points at, is written here)

	LD		r0
	DAC				;	A = curIdx - 1, C = havePrev ( == not borrow == carry)
	WR2				;	->prevIdx.lo4 = curIdx - 1

	TCC				;	A = havePrev, C = 0
	RAL				;	A = havePrev << 1
	WR3				;	->prevIdx.[bit4] = 0, ->havePrevIdx = curIdx != 0

	ISZ		r0, tlb_init_loop	;r0 wraps 15 -> 0 after the 16th entry, ending the loop

	;fix the first entry's "prevIdx" to be 0 to indicate that it is the head of bucket 0. r0 is currently 0
	SRC		r0
	LDM		0
	WR2

	;fix last entry's "haveNext"
	LDM		1
	DCL
	FIM		r0, DCL1_INDEX_IDX
	SRC		r0
	RD1								;DCL1_NUM_TLB_ENTRIES_M1
	XCH		r0						;r0:r1 points into last entry (only the register part matters for the status write below)
	LDM		TLB_ENTRIES_DCL
	DCL
	SRC		r0
	LDM		0
	WR1

	;read SD sector 0 to start of ram, hypercall handler itself will jump to "next_instr" which will set our PC to the current npc val of 0x80000000
	;hyper number is stored in $at == $1, other params are already at zero as needed
	LDM		0
	DCL
	FIM		r0, 1 * 8				;each MIPS register is 8 nibbles, so $1 starts at nibble 8 of DCL 0
	SRC		r0
	LDM		H_STOR_READ
	WRM
	JUN		cpuExtHypercall



;Advance NPC by 4 and fall into next_instr, so the instruction NPC used to point at is never executed.
;Used by the "branch likely" handlers when the branch is not taken.
skip_next_instr:
	LDM		1									;cpu.npc += 4
	DCL
	FIM		r2, DCL1_NPC
	LDM		4
	JMS		cpuAddNibble
	;fallthrough

;Normal instruction completion: pc = npc, npc += 4, inDelaySlot = 0, then fall into emu_cycle.
;Selects DCL 1 itself, so it may be entered with any DCL.
next_instr:
	LDM		1
	DCL
	FIM		r2, DCL1_PC							;cpu.pc = cpu.npc
	FIM		r4, DCL1_NPC
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	FIM		r2, DCL1_NPC						;cpu.npc += 4
	LDM		4
	JMS		cpuAddNibble
	;cpu.inDelaySlot = false;
	LDM		0
	WR0											;DCL1_IN_DELAY_SLOT (status char 0 of the register cpuAddNibble left selected, i.e. the one holding NPC)
	;fallthrough	

;Top of the per-instruction loop. Bumps the 4-nibble RTC counter kept in the status chars of the
;DCL1_RANDOM register; on selected counter values it also decrements the "random" value and
;outputs the current PC to the RAM output ports. Falls into rtc_tick when the whole counter wraps,
;otherwise goes to rtc_tic_not_yet (the IRQ check + instruction fetch).
emu_cycle:										;MUST be entered with DCL1 selected

	FIM		r2, DCL1_RANDOM	
	SRC		r2

	RD0		;DCL1_RTC_COUNTER[0]
	IAC
	WR0
	JCN		NZ, rtc_tic_not_yet					;low nibble did not wrap: nothing else to do this cycle

	RD1
	RAR											;C = bit 0 of DCL1_RTC_COUNTER[1]
	JCN		NC, ship_show_pc_and_incr_rand		;do the next things once every 32 instrs

	RDM											;random--
	DAC
;TLB: these next lines are only needed if we have a TLB entry count that is not 16. They are safe to execute regardless, though.
	JCN		C, no_rnd_udf						;C set == no borrow == random did not underflow
	FIM		r4, DCL1_INDEX_IDX
	SRC		r4
	RD1											;DCL1_NUM_TLB_ENTRIES_M1: reload value on underflow
	SRC		r2
;TLB: end conditional on tlb size
no_rnd_udf:
	WRM

	;show PC: write the 8 nibbles of PC to 8 RAM output ports (4 chips in each of 2 DCLs)
	FIM		r0, DCL1_PC							;r0 = src
	FIM		r6, 0EH								;r6 = dst dcl, r7 = outer counter

show_pc_outer:
	FIM		r4, 0CH								;r4:r5 = dst chip  (r5 does not matter for addressing), r5 = counter_inner

show_pc_inner:
	SRC		r0
	RDM
	XCH		r6									;A = dst dcl, r6 = nibble just read
	DCL
	XCH		r6									;A = nibble, r6 = dst dcl again
	SRC		r4
	WMP
	INC		r1
	CLC
	LDM		4
	ADD		r4
	XCH		r4									;r4 += 4: next chip
	LDM		1
	DCL											;back to DCL 1 for the next read
	ISZ		r5, show_pc_inner

	LDM		2
	XCH		r6									;second outer pass uses dst dcl 2
	ISZ		r7, show_pc_outer

	SRC		r2									;re-select the DCL1_RANDOM register for the counter updates below

ship_show_pc_and_incr_rand:
	RD1		;DCL1_RTC_COUNTER[1]
	IAC
	WR1
	JCN		NZ, rtc_tic_not_yet
	RD2		;DCL1_RTC_COUNTER[2]
	IAC
	WR2
	JCN		NZ, rtc_tic_not_yet
	RD3		;DCL1_RTC_COUNTER[3]
	IAC
	WR3
	JCN		NZ, rtc_tic_not_yet

;Reached when the 16-bit RTC counter wraps: raise the RTC interrupt bit in CAUSE, poll the
;UART for a received byte, then recompute pending IRQs.
rtc_tick:
	
	;set irq: sets bit 1 of the nibble at DCL1_CAUSE + 3 and leaves the other bits and C unchanged
	FIM		r4, DCL1_CAUSE + 3
	SRC		r4
	RDM
	RAR
	RAR
	STC
	RAL
	RAL
	WRM


check_irq:
	JCN		T, dz11_no_data_to_rx		;branch on the state of the 4004 TEST pin (NOTE(review): presumably wired to "no RX data" - confirm)
	FIM		r2, DCL1_DZ11_FLAGS
	SRC		r2
	RD2									;DCL1_DZ11_HAVE_RX
	JCN		NZ, dz11_buffer_still_full	;we avoid ever overflowing the buffer by not RXing when buffer is not empty

	FIM		r12, 0
	JMS		cpuPrvGetchar_veneer		;returns in r0:r1, C = got a byte
	JCN		NC, dz11_no_data_to_rx

dz11_got_char:
	FIM		r2, DCL1_DZ11_RX_BUFPTR
	SRC		r2
	LD		r1
	WR1									;DCL1_DZ11_RX_BUF_LO
	LD		r0
	WR0									;DCL1_DZ11_RX_BUF_HI
	FIM		r0, DCL1_DZ11_FLAGS
	SRC		r0
	LDM		8
	WR2									;DCL1_DZ11_HAVE_RX
	JMS		dz11_recalc
	JUN		dz11_rx_handled

dz11_no_data_to_rx:
dz11_buffer_still_full:
dz11_rx_handled:
rtc_tic_handled:						;we got here if we did cause an irq
	JMS		cpuPrvIrqsRecalc

;Take a pending interrupt if there is one, otherwise fetch the instruction at PC and dispatch it.
rtc_tic_not_yet:

check_irqs:
	;in theory 8 irq sources exist, encoded in bits CAUSE[8..15], with active-high enables in STATUS[8..15]
	;the bottom 2 are sw-controlled, the remaining 6 are hardware irqs
	;on the Decstation 2100, irq assignments are: 2 = SCSI, 3 = ethernet, 4 = DZ11 uart, 5 = DS1287 RTC, 7 = bus interface unit
	;of these we only ever have UART and RTC IRQs used, so we only analyze the bottom 2 bits of each nibble of CAUSE and STATUS
	;this is faster. Not supporting soft-triggered IRQs is even faster and linux seems ok with it!

	FIM		r4, DCL1_PC
	SRC		r4
	RD1												;DCL1_IRQPENDING (status char 1 of the PC/NPC register)
	JCN		Z, no_irqs_now

irq_found:
	FIM		r0, CP0_EXC_COD_IRQ * 4					;exception code goes in r0:r1, pre-shifted to bits 2..6
	JUN		take_exc_normal_vec

no_irqs_now:										;tmp32B = PC (r4:r5 still points at DCL1_PC)
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	LDM		MEM_READ_INSTR
	XCH		r11
	FIM		r8, 0DDH
	JMS		mem_access_do_with_calced_aligned_addr	;only returns on success. data will be in TMP32C

	DB		03H		;hyper 3 (raw byte 0x03 is not a documented 4004 opcode; NOTE(review): presumably an emulator/debug hook - confirm)

	JUN		main_decode_dispatch


;COP1 load/store opcodes (LWC1/SWC1/LDC1/SDC1): call cpuPrvCopAccess_1 (defined elsewhere) and,
;if it returns, raise the reserved-instruction exception.
;NOTE(review): presumably cpuPrvCopAccess_1 does not return when it raises "coprocessor unusable" - confirm
instr_lwc1:
instr_swc1:
instr_ldc1:
instr_sdc1:
	JMS		cpuPrvCopAccess_1
	JUN		instr_undef




	org		0100H
;64-entry jumptable based on top 6 bits of instr
;each JUN is 2 bytes, so entry N lives at 0x100 + 2*N; main_decode_dispatch (same page) indexes it with JIN
;the table must stay exactly 64 entries long and at the start of this page
	JUN		instr_top_lvl_0
	JUN		instr_top_lvl_1
	JUN		instr_j
	JUN		instr_jal
	JUN		instr_beq
	JUN		instr_bne
	JUN		instr_blez
	JUN		instr_bgtz
	JUN		instr_addi
	JUN		instr_addiu
	JUN		instr_slti
	JUN		instr_sltiu
	JUN		instr_andi
	JUN		instr_ori
	JUN		instr_xori
	JUN		instr_lui
	JUN		instr_cop0
	JUN		instr_cop1
	JUN		instr_undef		;cop2
	JUN		instr_cop3
	JUN		instr_beql
	JUN		instr_bnel
	JUN		instr_undef		;22
	JUN		instr_undef		;23
	JUN		instr_undef		;24
	JUN		instr_undef		;25
	JUN		instr_undef		;26
	JUN		instr_undef		;27
	JUN		instr_undef		;28
	JUN		instr_undef		;29
	JUN		instr_undef		;30
	JUN		instr_undef		;31
	JUN		instr_lb
	JUN		instr_lh
	JUN		instr_lwl
	JUN		instr_lw
	JUN		instr_lbu
	JUN		instr_lhu
	JUN		instr_lwr
	JUN		instr_undef		;39
	JUN		instr_sb
	JUN		instr_sh
	JUN		instr_swl
	JUN		instr_sw
	JUN		instr_undef		;44
	JUN		instr_undef		;45
	JUN		instr_swr
	JUN		instr_undef		;47
	JUN		instr_ll
	JUN		instr_lwc1
	JUN		instr_undef		;lwc2
	JUN		next_instr		;51 = PREF (treated as a no-op)
	JUN		instr_undef		;52
	JUN		instr_ldc1
	JUN		instr_undef		;ldc2
	JUN		instr_undef		;55
	JUN		instr_sc
	JUN		instr_swc1
	JUN		instr_undef		;swc2
	JUN		instr_undef		;59
	JUN		instr_undef		;60
	JUN		instr_sdc1
	JUN		instr_undef		;sdc2
	JUN		instr_undef		;63


;Load the fetched instruction word from DCL1_TMP32C into registers and jump through the
;64-entry opcode table at the start of this page.
;Out (for the handlers): instr nibbles, lowest first, in r15, r14, r13, r12, r11, r10, r9; DCL 0 selected.
;Must stay in the same page as the opcode table because JIN only jumps within the current page.
main_decode_dispatch:

	FIM		r2, DCL1_TMP32C

	;load it into regs. watch the order
	SRC		r2
	INC		r3
	RDM
	XCH		r15
	
	SRC		r2
	INC		r3
	RDM
	XCH		r14
	
	SRC		r2
	INC		r3
	RDM
	XCH		r13
	
	SRC		r2
	INC		r3
	RDM
	XCH		r12
	
	SRC		r2
	INC		r3
	RDM
	XCH		r11
	
	SRC		r2
	INC		r3
	RDM
	XCH		r10
	
	SRC		r2
	INC		r3
	RDM
	XCH		r9
	
	SRC		r2
	RDM

;this code assumes it has no caller...it will overflow the stack repeatedly for profit
;assumes instr word is in A:r9:r10:r11:r12:r13:r14:r15	where A is top nibble, r9 is second-to-top, etc..
;get ((instr >> 26) << 1) into r6:r7, then jump based on it
	CLC
	RAR				;A = top nibble >> 1, C = instr bit 28
	XCH		r6		;r6 = high nibble of the table offset
	LD		r9
	RAR				;A = bit28 : r9[3:1]
	CLC
	RAR				;A = 0 : bit28 : r9[3:2]
	CLC
	RAL				;A = bit28 : r9[3:2] : 0
	XCH		r7		;r7 = low nibble of the table offset (always even)
	LDM		0
	DCL				;handlers start with DCL 0 selected
	JIN		r6


;Negate (two's complement) the 8-nibble value at r2:r3 in the currently selected DCL.
;In: r2:r3 = pointer to the lowest nibble. Clobbers: A, C, r0; r3 is advanced by 8. Returns A = 0.
cpuPrvNeg32:		;ptr in r2, clobbers r0
	LDM		8
	XCH		r0		;ISZ counter: 8 iterations
	CLC
cpuPrvNeg32_loop:
	SRC		r2
	INC		r3
	LDM		0
	SBM				;A = 0 - nibble - borrow
	WRM
	CMC				;SBM leaves C = "no borrow"; flip it so it is consumed as a borrow by the next SBM
	ISZ		r0, cpuPrvNeg32_loop
	BBL		0

;Cross-DCL nibble copy, three entry points that fall into each other:
;  cpuPrvCopy8_r4r5_to_r2r3_DCL0to1   - copy 8 nibbles from DCL 0 to DCL 1
;  cpuPrvCopy8_r4r5_to_r2r3_with_DCL  - copy 8 nibbles, r6 = source DCL, r7 = destination DCL
;  cpuPrvCopyX_r4r5_to_r2r3_with_DCL  - copy X nibbles, caller preloads r0 (count) and r1 (pointer fix-up)
;In: r4:r5 = source pointer, r2:r3 = destination pointer.
;Out: the destination DCL (r7) is left selected. Clobbers A, C, r0 (r1 too for the 8-nibble entries).
cpuPrvCopy8_r4r5_to_r2r3_DCL0to1:
	FIM		r6, 01H			;r6 = 0 (src DCL), r7 = 1 (dst DCL)
	;fallthrough

cpuPrvCopy8_r4r5_to_r2r3_with_DCL:			;copy 8 nibbles from RAM at DCL_r6.r4:r5 to ram at DCL_r7.r2:r3. it is assumed that the nibble addresses do not cross a 16-boundary, clobbers r0 and r1
	FIM		r0, 88h			;r0 = 8 -> 8 loop iterations, r1 = 8 -> pointers end up back at their original values
	;fallthrough

cpuPrvCopyX_r4r5_to_r2r3_with_DCL:			;copy X nibbles from RAM at DCL_r6.r4:r5 to ram at DCL_r7.r2:r3. it is assumed that the nibble addresses do not cross a 16-boundary. to copy X nibbles, set r0 to "0x0f & (16 - x)", to restore pointer back to original value, set r1 to same value as r0
	;copy
cpuPrvCopy8_r4r5_to_r2r3_loop1:
	LD		r6
	DCL						;select source DCL
	SRC		r4
	RDM
	XCH		r7				;A = dst DCL, r7 = data nibble (temporarily)
	DCL						;select destination DCL
	XCH		r7				;A = data nibble, r7 = dst DCL again
	SRC		r2
	WRM
	INC		r5
	INC		r3
	ISZ		r0, cpuPrvCopy8_r4r5_to_r2r3_loop1

	;now fix r3 and r5: add r1 (mod 16) to both low pointer nibbles
copy_loop_fix_ptrs:
	CLC
	LD		r3
	ADD		r1
	XCH		r3
	CLC
	LD		r5
	ADD		r1
	XCH		r5
	BBL		0

;Copy 8 nibbles from r4:r5 to r2:r3 within the currently selected DCL, then restore both pointers.
;cpuPrvCopyX_r4r5_to_r2r3_currDCL is the variable-length entry: caller preloads r0 with
;"0x0f & (16 - x)" and r1 with the value copy_loop_fix_ptrs should add to r3 and r5.
;Clobbers A, C, r0 (and r1 for the 8-nibble entry).
cpuPrvCopy8_r4r5_to_r2r3_currDCL:					;clobbers r0, r1, restores pointers to prev state
	FIM		r0, 088H		;r0 = 8 iterations, r1 = 8 for the pointer fix-up
cpuPrvCopyX_r4r5_to_r2r3_currDCL:
cpuPrvCopy8_r4r5_to_r2r3_currDCL_loop:
	SRC		r4
	INC		r5
	RDM
	SRC		r2
	INC		r3
	WRM
	ISZ		r0, cpuPrvCopy8_r4r5_to_r2r3_currDCL_loop
	JUN		copy_loop_fix_ptrs	;tail call: its BBL returns to our caller



;Compute the RAM pointer of MIPS register S (instr bits 21..25) into r6:r7.
;S = r9[1:0]:r10[3:1]; the pointer is S * 8 nibbles, i.e. r6 = S >> 1, r7 = (S & 1) << 3.
;In: r9, r10 = instr nibbles. Clobbers A, C. Returns A = 0.
cpuGetRegPtrS_r6r7:	;into r6:r7, S is at bit 21
	LD		r10
	RAL				;C = r10[3] (bit shifted in at the bottom is don't-care and is discarded later)
	XCH		r7		;r7 = scratch
	LD		r9
	RAL				;A = r9[2:0] : r10[3]
	XCH		r7
	RAL				;C = r10[2]
	XCH		r7
	RAL				;A = r9[1:0] : r10[3:2] = S >> 1
	XCH		r6
	LD		r7
	RAL				;C = r10[1] = S & 1
	LDM		0
	RAR				;A = (S & 1) << 3
	XCH		r7
	BBL		0

;Compute the RAM pointer of MIPS register T (instr bits 16..20) into r6:r7.
;T = r10[0]:r11[3:0]; pointer = T * 8 nibbles. Clobbers A, C. Returns A = 0.
cpuGetRegPtrT_r6r7:
	LD		r10
	RAR				;C = instr bit 20
	LD		r11
	RAR				;A = T >> 1, C = T & 1
	XCH		r6
	LDM		0
	RAR				;A = (T & 1) << 3
	XCH		r7
	BBL		0

;Compute the RAM pointer of MIPS register T (instr bits 16..20) into r4:r5.
;Same computation as cpuGetRegPtrT_r6r7 with a different destination pair. Clobbers A, C. Returns A = 0.
cpuGetRegPtrT_r4r5:	;into r4:r5, T is at bit 16
	LD		r10
	RAR				;C = instr bit 20
	LD		r11
	RAR				;A = T >> 1, C = T & 1
	XCH		r4
	LDM		0
	RAR				;A = (T & 1) << 3
	XCH		r5
	BBL		0

;Compute the RAM pointer of MIPS register T (instr bits 16..20) into r2:r3.
;Same computation as cpuGetRegPtrT_r6r7 with a different destination pair. Clobbers A, C. Returns A = 0.
cpuGetRegPtrT_r2r3:	;into r2:r3, T is at bit 16
	LD		r10
	RAR				;C = instr bit 20
	LD		r11
	RAR				;A = T >> 1, C = T & 1
	XCH		r2
	LDM		0
	RAR				;A = (T & 1) << 3
	XCH		r3
	BBL		0

;Get the pointer of destination register D into r2:r3. If D is $0 (pointer 0:0) the whole
;instruction is abandoned by jumping to next_instr instead of returning; the stale return
;addresses are harmless because the 4004 stack simply wraps.
;dst_zero_check is also entered by jump from cpuGetRegPtrT_r4r5_skip_if_zero with r2:r3 already set.
cpuGetRegPtrD_r2r3_skip_if_zero:	;into r2:r3, D is at bit 11
	JMS		cpuGetRegPtrD_r2r3
dst_zero_check:
	LD		r2
	JCN		NZ, cpuGetRegPtrD_r2r3_skip_if_zero_pass
	LD		r3
	JCN		NZ, cpuGetRegPtrD_r2r3_skip_if_zero_pass
	JUN		next_instr
cpuGetRegPtrD_r2r3_skip_if_zero_pass:
	BBL		0

;Compute the RAM pointer of MIPS register D (instr bits 11..15) into r2:r3.
;D = r12[3:0]:r13[3]; pointer = D * 8 nibbles, i.e. r2 = r12, r3 = r13[3] << 3.
;Clobbers A, C. Returns A = 0.
cpuGetRegPtrD_r2r3:	;into r2:r3, D is at bit 11
	LD		r12
	XCH		r2
	LD		r13
	RAL				;C = instr bit 11
	LDM		0
	RAR				;A = bit11 << 3
	XCH		r3
	BBL		0

;Get the pointer of register T into r2:r3 and abandon the instruction (jump to next_instr) if T is $0.
;NOTE: despite the "_r4r5" in the name, the result is produced in r2:r3 (it calls cpuGetRegPtrT_r2r3).
cpuGetRegPtrT_r4r5_skip_if_zero:	;into r2:r3, T is at bit 16
	JMS		cpuGetRegPtrT_r2r3
	JUN		dst_zero_check		;dst_zero_check's BBL returns to our caller

;First half of every taken branch/jump: inDelaySlot = 1, pc = npc. Leaves DCL 1 selected.
;Clobbers A, r2:r3, r4:r5 (pointers are left advanced by the copy).
cpuJumpCommon:									;pc = npc, inDelaySlot = 1
	LDM		1
	DCL

	FIM		r4, DCL1_NPC						;npc
	SRC		r4
	LDM		1
	WR0											;DCL1_IN_DELAY_SLOT

	;pc = npc
	FIM		r2, DCL1_PC										;pc
	JUN		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore	;tail call: copy 8 nibbles from r4:r5 to r2:r3 in the current DCL

;Branch to the address held in DCL1_TMP32C: pc = npc, inDelaySlot = 1, npc = TMP32C.
;DCL 1 is selected by cpuJumpCommon and nothing after it changes the DCL.
cpuBranchToAddr:								;pc = npc, inDelaySlot = 1, npc = DCL1_TMP32C. MUST end with DCL1
	JMS		cpuJumpCommon

	;npc = <target address>
	FIM		r4, DCL1_TMP32C
	FIM		r2, DCL1_NPC
	JUN		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore

;Branch to the address held in MIPS register S: pc = npc, inDelaySlot = 1, npc = regS.
;The cross-DCL copy leaves its destination DCL (r7 = 1) selected, satisfying "MUST end with DCL1".
cpuBranchToRegS:								;pc = npc, inDelaySlot = 1, npc = regS. MUST end with DCL1
	JMS		cpuJumpCommon

	;npc = regS
	JMS		cpuGetRegPtrS_r4r5
	FIM		r6, 01H								;DCL0 to DCL1 copy
	FIM		r2, DCL1_NPC
	JUN		cpuPrvCopy8_r4r5_to_r2r3_with_DCL	;copy 8 nibbles from RAM at DCL_r6.r4:r5 to ram at DCL_r7.r2:r3


;SLL / SLLV: Rd = Rt << amount.
;instr_sll_imm gets the 5-bit amount via cpuGetRegNumA_r6r7; instr_sll_have_imm is entered (from
;instr_sllv) with the amount already in r6:r7 (r6 = bit 4, r7 = low 4 bits).
;The shift is done as a whole-nibble move (amount / 4) followed by 0..3 single-bit shifts (amount % 4).
;Clobbers r2..r9 and A/C; ends at next_instr.
instr_sll_imm:
	JMS		cpuGetRegNumA_r6r7				;into r6:r7

instr_sll_have_imm:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	
	LD		r6
	RAR
	LD		r7
	RAR
	CLC
	RAR										;A = amount >> 2 = number of whole nibbles
	JCN		Z, instr_sll_shift_less_than_by_4	;at jump dest, r7 = num_bits_to_shift_by_after_nibble_shifting_is_done (since r6:r7 was under 4, this is true still) 
	XCH		r7									;r7 = num_nibbles_to_shift_by
	RAL
	RAL
	CLC
	RAR
	CLC
	RAR										;A = amount & 3
	XCH		r7									;r7 = num_bits_to_shift_by_after_nibble_shifting_is_done, A = num_nibbles_to_shift_by

instr_sll_shift_by_nibbles:						;A = "N" = num nibbles to shift by is 1..7
	
	XCH		r6									;save N
	CLC
	LDM		8
	ADD		r6
	XCH		r8									;ISZ count for copying (8 - N iterations)
	CLB
	SUB		r6									;leaves C = 0
	XCH		r9									;ISZ count for zeroing (N iterations)
	LDM		7
	ADD		r3									;leaves C = 0
	XCH		r3									;r2:r3 = copy DST (top nibble of dst)
	LDM		7
	ADD		r5									;leaves C = 0
	SUB		r6
	XCH		r5									;r4:r5 = copy start (nibble 7 - N of src)


instr_sll_copy_loop:							;copies from the top nibble downward
	SRC		r4
	LD		r5
	DAC
	XCH		r5
	RDM
	SRC		r2
	WRM
	LD		r3
	DAC
	XCH		r3
	ISZ		r8, instr_sll_copy_loop

	LDM		0
instr_sll_zero_loop:							;zero the N low nibbles
	SRC		r2
	WRM
	XCH		r3
	DAC
	XCH		r3
	ISZ		r9, instr_sll_zero_loop
	INC		r3									;undo the final decrement: r3 is back at the lowest nibble
	LD		r3

instr_sll_bitwise:								;r2:A (YES) points to DST LSB
	XCH		r4									;r4 = low pointer nibble of dst, reloaded for each single-bit pass
	LD		r7
	JCN		Z, instr_sll_done
	CMA
	IAC
	XCH		r7									;ISZ counter for shifting left

instr_sll_bitwise_outer:
	LD		r4
	XCH		r3
	LDM		8
	XCH		r5									;8 nibbles per pass
	CLC

instr_sll_bitwise_inner:
	SRC		r2
	INC		r3
	RDM
	RAL											;carry propagates from each nibble into the next higher one
	WRM
	ISZ		r5, instr_sll_bitwise_inner
	ISZ		r7, instr_sll_bitwise_outer

instr_sll_done:
	JUN		next_instr


instr_sll_shift_less_than_by_4:					;no whole-nibble move needed: plain copy Rt -> Rd, then bit shifts
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	LD		r3									;copy left r3 advanced by 7
	RAL
	LDM		0
	RAR											;A = r3 & 8 = low pointer nibble of dst's lowest nibble
	JUN		instr_sll_bitwise


;SRL / SRLV / SRA / SRAV: Rd = Rt >> amount, filling vacated bits with bit 0 of r14.
;instr_srX_imm is the SRL entry (amount from cpuGetRegNumA_r6r7); instr_srl_have_imm expects the
;amount in r6:r7; instr_srX_imm_common is entered from the SRA path with r14 and r4:r5 already set.
;As with SLL, whole nibbles are moved first and the remaining 0..3 bits are shifted one at a time.
;Clobbers r2..r8, r14 and A/C; ends at next_instr.
instr_srX_imm:
	JMS		cpuGetRegNumA_r6r7				;into r6:r7

instr_srl_have_imm:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5

	LDM		0
	XCH		r14								;r14[bit 0] = value to put into missing bits (allows reusing this code for SRA). we know r14 from the instr value will no longer be used 

instr_srX_imm_common:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3

	LD		r6
	RAR
	LD		r7
	RAR
	CLC
	RAR										;A = amount >> 2 = number of whole nibbles
	JCN		Z, instr_srX_shift_less_than_by_4	;at jump dest, r7 = num_bits_to_shift_by_after_nibble_shifting_is_done (since r6:r7 was under 4, this is true still) 
	XCH		r7									;r7 = num_nibbles_to_shift_by
	RAL
	RAL
	CLC
	RAR
	CLC
	RAR										;A = amount & 3
	XCH		r7									;r7 = num_bits_to_shift_by_after_nibble_shifting_is_done, A = num_nibbles_to_shift_by

instr_srX_shift_by_nibbles:
	CLC
	ADD		r5
	XCH		r5									;r4:r5 = copy_src, r2:r3 = copy dst
	LD		r5
	RAL
	STC
	RAR											;A = r5 with its top bit forced to 1
	XCH		r6									;r6 = ISZ counter for copying
	LDM		8
	CLC
	SUB		r6
	XCH		r8									;r8 = ISZ counter for filling

	
instr_srX_shift_by_nibbles_copy_loop:
	SRC		r4
	INC		r5
	RDM
	SRC		r2
	INC		r3
	WRM
	ISZ		r6, instr_srX_shift_by_nibbles_copy_loop

	LD		r14
	CMA
	IAC											;A = fill NIBBLE (0 - r14: 0x0 for r14 = 0, 0xF for r14 = 1)

instr_srX_shift_by_nibbles_fill_loop:
	SRC		r2
	INC		r3
	WRM
	ISZ		r8, instr_srX_shift_by_nibbles_fill_loop
	
	;calc pointer to highest nibble in the word
	LD		r3
	DAC											;A = low nibble of pointer to high nibble of number

instr_srX_shift_remainder:						;we get here with r2:A (YES this is correct) pointing TO the high nibble of dst, r7 = num_bits_to_shift_by
	XCH		r4
	LD		r7
	JCN		Z, instr_srX_done
	CMA
	IAC
	XCH		r7									;r7 = ISZ counter for the shift

instr_srX_shift_outer:
	LD		r4
	XCH		r3
	LDM		16 - 8
	XCH		r5									;8 nibbles per pass
	LD		r14
	RAR											;C = fill bit, shifted into the top of the highest nibble

instr_srX_shift_inner:							;walks from the top nibble downward, carrying the shifted-out bit along
	SRC		r2
	RDM
	RAR
	WRM
	TCC											;park the carry in A while the pointer is decremented (DAC changes C)
	XCH		r3
	DAC
	XCH		r3
	RAR											;restore the carry
	ISZ		r5, instr_srX_shift_inner
	ISZ		r7, instr_srX_shift_outer

instr_srX_done:
	JUN		next_instr

instr_srX_shift_less_than_by_4:					;no whole-nibble move needed: plain copy Rt -> Rd, then bit shifts
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	LD		r3									;pointer was advanced by 7 as needed
	JUN		instr_srX_shift_remainder


;SRA / SRAV: read the sign bit of Rt into r14 and reuse the common right-shift code.
;instr_sra_have_imm is entered (from instr_srav) with the amount already in r6:r7.
instr_sra_imm:	;see if negative, then go to the common handler
	JMS		cpuGetRegNumA_r6r7				;into r6:r7

instr_sra_have_imm:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	CLC						;SRC (r4:r5 + 7)
	LDM		7
	ADD		r5
	XCH		r5
	SRC		r4
	XCH		r5				;restore r5 (A held the original r5 after the XCH above)
	RDM
	RAL		;C is now 1 if number is negative, 0 if positive
	TCC		;A is now 1 if number is negative, 0 if positive
	XCH		r14							;r14 = value to put into missing bits (allows reusing the SRL code for SRA). we know r14 from the instr value will no longer be used 
	JUN		instr_srX_imm_common




	org		0300H
;64-entry jumptable based on bottom 6 bits of instr for instrs whose top 6 bits are 0b000000
;each JUN is 2 bytes, so entry N lives at 0x300 + 2*N; instr_top_lvl_0 (same page) indexes it with JIN
;the table must stay exactly 64 entries long and at the start of this page
	JUN		instr_sll_imm
	JUN		instr_undef		;1
	JUN		instr_srX_imm
	JUN		instr_sra_imm
	JUN		instr_sllv
	JUN		instr_undef		;5
	JUN		instr_srlv
	JUN		instr_srav
	JUN		instr_jr
	JUN		instr_jalr
	JUN		instr_undef		;10 = movz
	JUN		instr_undef		;11 = movn
	JUN		instr_syscall
	JUN		instr_break
	JUN		instr_undef		;14
	JUN		next_instr		;sync (treated as a no-op)
	JUN		instr_mfhi
	JUN		instr_mthi
	JUN		instr_mflo
	JUN		instr_mtlo
	JUN		instr_undef		;20
	JUN		instr_undef		;21
	JUN		instr_undef		;22
	JUN		instr_undef		;23
	JUN		instr_mult
	JUN		instr_multu
	JUN		instr_div
	JUN		instr_divu
	JUN		instr_undef		;28
	JUN		instr_undef		;29
	JUN		instr_undef		;30
	JUN		instr_undef		;31
	JUN		instr_add
	JUN		instr_addu
	JUN		instr_sub
	JUN		instr_subu
	JUN		instr_and
	JUN		instr_or
	JUN		instr_xor
	JUN		instr_nor
	JUN		instr_undef		;40
	JUN		instr_undef		;41
	JUN		instr_slt
	JUN		instr_sltu
	JUN		instr_undef		;44
	JUN		instr_undef		;45
	JUN		instr_undef		;46
	JUN		instr_undef		;47
	JUN		instr_tge
	JUN		instr_tgeu
	JUN		instr_tlt
	JUN		instr_tlu
	JUN		instr_teq
	JUN		instr_undef		;53
	JUN		instr_tne
	JUN		instr_undef		;55
	JUN		instr_undef		;56
	JUN		instr_undef		;57
	JUN		instr_undef		;58
	JUN		instr_undef		;59
	JUN		instr_undef		;60
	JUN		instr_undef		;61
	JUN		instr_undef		;62
	JUN		instr_undef		;63





;Second-level dispatch for opcode 0 (SPECIAL): jump through the funct table at the start of this page.
;Must stay in the same page as that table because JIN only jumps within the current page.
instr_top_lvl_0:
;assumes the low instr nibbles are in r14:r15 (as loaded by main_decode_dispatch)
;get ((instr & 0x3f) << 1) into r6:r7, then jump based on it
	LD		r15
	CLC
	RAL				;A = r15[2:0] : 0, C = r15[3]
	XCH		r7		;r7 = low nibble of the table offset (always even)
	LD		r14
	RAL				;A = r14[2:0] : r15[3]
	CLC
	RAL
	CLC
	RAR				;A = 0 : r14[1:0] : r15[3] (drops instr bits 6 and 7)
	XCH		r6
	JIN		r6




;Fetch the variable shift amount (low 5 bits of register S) for SLLV/SRLV/SRAV.
;Out: r7 = bits 0..3 of regS, r6 = bit 4 of regS - the same layout cpuGetRegNumA_r6r7 is expected to produce.
;Reads from the currently selected DCL (DCL 0 when reached from the dispatcher). Clobbers A, C.
instr_sXXv_getShiftAmt:					;get the actual imm into r6:r7, like cpuGetRegNumA_r6r7 would have
	JMS		cpuGetRegPtrS_r6r7
	SRC		r6
	INC		r7
	RDM				;A = lowest nibble of regS
	SRC		r6		;select the second nibble
	XCH		r7		;r7 = lowest nibble
	RDM
	RAR				;C = bit 4 of regS
	TCC
	XCH		r6		;r6 = bit 4
	BBL		0

;SLLV / SRLV / SRAV: load the shift amount from register S, then join the corresponding
;immediate-shift handler after its own amount decoding.
instr_sllv:
	JMS		instr_sXXv_getShiftAmt
	JUN		instr_sll_have_imm

instr_srlv:
	JMS		instr_sXXv_getShiftAmt
	JUN		instr_srl_have_imm

instr_srav:
	JMS		instr_sXXv_getShiftAmt
	JUN		instr_sra_have_imm

;JR: npc = regS. Goes straight to emu_cycle (not next_instr) because cpuBranchToRegS has
;already updated pc/npc and left DCL 1 selected.
instr_jr:
	JMS		cpuBranchToRegS
	JUN		emu_cycle		

;MFHI / MFLO: Rd = HI / LO (skipped entirely when Rd is $0).
;copy_8_DCL1_to_DCL0_and_next_instr: copy 8 nibbles from DCL1.r4:r5 to DCL0.r2:r3, then next_instr.
;copy_8_and_next_instr: same, with the caller supplying r6 = source DCL, r7 = destination DCL.
instr_mfhi:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	FIM		r4, DCL1_HI							;src = HI
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

instr_mflo:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	FIM		r4, DCL1_LO							;src = LO
	;fallthrough

copy_8_DCL1_to_DCL0_and_next_instr:
	FIM		r6, 10H								;DCL1 to DCL0 copy
	;fallthrough

copy_8_and_next_instr:
	JMS		cpuPrvCopy8_r4r5_to_r2r3_with_DCL	;copy 8 nibbles from RAM at DCL_r6.r4:r5 to ram at DCL_r7.r2:r3
	JUN		next_instr

;MTHI / MTLO: HI / LO = Rs.
;copy_8_DCL0_to_DCL1_and_next_instr: copy 8 nibbles from DCL0.r4:r5 to DCL1.r2:r3, then next_instr.
instr_mthi:
	JMS		cpuGetRegPtrS_r4r5
	FIM		r2, DCL1_HI							;dst = HI

copy_8_DCL0_to_DCL1_and_next_instr:
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1	;copy 8 nibbles from DCL0.r4:r5 to DCL1.r2:r3
	JUN		next_instr

instr_mtlo:
	JMS		cpuGetRegPtrS_r4r5
	FIM		r2, DCL1_LO							;dst = LO
	JUN		copy_8_DCL0_to_DCL1_and_next_instr

;Exception entry.
;  take_exc_normal_vec - in: r0:r1 = exception code in bits 2..6; uses the EXC_VEC_NORMAL vector
;  take_exc_with_vec / cpuTakeException - additionally in: A = vectorADDR / 16
;Effects: masks IRQs, cause.BD = inDelaySlot, inDelaySlot = 0, cause.EXC_COD = code,
;         epc = pc (minus 4 if we were in a delay slot), pc = 0x80000000 + (vector << 4), npc = pc + 4,
;         status KU/IE stack pushed by 2 bits. Never returns; continues at emu_cycle with DCL 1 selected.
;Clobbers r0..r5, r14, r15, A, C.
;Fixes relative to the previous revision (total code size is unchanged):
;  - the cleared inDelaySlot flag is now written to status char 0 (it used to be written with WRM into NPC's nibble 0)
;  - a CLC was added before the RAR so that exactly 0 is stored regardless of the carry on entry
;  - STATUS nibble 1 is now actually selected before it is read and written back (an SRC was missing after INC r1)
;  - the unused "LDM 16 - 2 / XCH r3" counter set-up was removed
take_exc_normal_vec:
	LDM		EXC_VEC_NORMAL
	;fallthrough

take_exc_with_vec:				;vector in A
	;fallthrough

cpuTakeException:	;code in r0:r1's bits 2..6, A = vectorADDR / 16, clobbers many things

	;stash vectorADDR
	XCH		r15

	LDM		1
	DCL

	;prepare for the below
	FIM		r2, DCL1_NPC
	SRC		r2
	
	;taking an exception masks IRQs
	LDM		0
	WR1				;DCL1_IRQPENDING

	;cpu.cause.BD = cpu.inDelaySlot; cpu.inDelaySlot = 0;
	RD0				;DCL1_IN_DELAY_SLOT
	CLC				;so that the RAR below shifts in a 0
	RAR				;C = wasInDelaySlot, A = 0
	WR0				;DCL1_IN_DELAY_SLOT = 0
	RAL				;A = wasInDelaySlot, C = 0
	FIM		r2, DCL1_CAUSE + 7
	SRC		r2
	XCH		r3		;r3 = was_inDelaySlot (r3 is free to use as scratch now that the address has been sent)
	RDM				;A = CAUSE's top nibble
	RAL				;drop the old BD bit
	XCH		r3		
	RAR				;C = was_inDelaySlot
	XCH		r3
	RAR				;A = was_inDelaySlot : CAUSE top nibble bits 2..0
	WRM
	XCH		r14		;top bit = "was In delay slot"


	;cpu.cause.EXC_COD = excCode
	FIM		r2, DCL1_CAUSE
	SRC		r2
	LD		r1
	WRM
	INC		r3
	SRC		r2
	LD		r0
	WRM

	;cpu.epc = cpu.pc
	FIM		r4, DCL1_PC
	FIM		r2, DCL1_EPC
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore

	;if (wasInDelaySlot) cpu.epc -= 4
	LD		r14
	RAL
	JCN		NC, exc_was_not_in_delay_slot
	
	LDM		4
	FIM		r2, DCL1_EPC
	JMS		cpuSubNibble
exc_was_not_in_delay_slot:

	;cpu.pc=0x80000000
	LDM		9					;ISZ count for 7 nibbles
	XCH		r0
	FIM		r4, DCL1_PC
	JMS		cpuPrvZeroFillEx	;zero nibbles 0..6, leaves r4:r5 at nibble 7
	LDM		8
	SRC		r4
	WRM

	;cpu.pc += vectorADDR << 4
	FIM		r4, DCL1_PC + 1
	SRC		r4
	LD		r15
	WRM

	;npc = pc
	FIM		r4, DCL1_PC
	FIM		r2, DCL1_NPC
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore	;copy 8 nibbles from r4:r5 to r2:r3 in the current DCL

	;npc += 4
	LDM		4
	FIM		r2, DCL1_NPC
	JMS		cpuAddNibble

	;cpu.status =
	;	(cpu.status &~ (CP0_STATUS_KUO | CP0_STATUS_IEO | CP0_STATUS_KUP | CP0_STATUS_IEP | CP0_STATUS_KUC | CP0_STATUS_IE)) |
	;	((cpu.status & (CP0_STATUS_KUP | CP0_STATUS_IEP | CP0_STATUS_KUC | CP0_STATUS_IE)) << 2);
	;AKA: cpu.status = (cpu.status &~ 0x3f) | (cpu.status & 0x0f) << 2)

	;STA HAD:		???? ???? ???? ???? ???? ???? abcd efgh
	;STA WANT:		???? ???? ???? ???? ???? ???? abef gh00

	FIM		r0, DCL1_STATUS
	SRC		r0
	RDM				;c = ?, a = efgh, r2 = ????
	XCH		r2		;c = ?, a = ????, r2 = efgh
	INC		r1
	SRC		r0		;select STATUS nibble 1: without this the RDM below re-reads nibble 0 and the WRM further down never reaches nibble 1
	RDM				;c = ?, a = abcd, r2 = efgh
	CLC				;c = 0, a = abcd, r2 = efgh
	RAR				;c = d, a = 0abc, r2 = efgh
	CLC				;c = 0, a = 0abc, r2 = efgh
	RAR				;c = c, a = 00ab, r2 = efgh
	CLC				;c = 0, a = 00ab, r2 = efgh

	XCH		r2		;c = 0, a = efgh, r2 = 00ab
	RAL				;c = e, a = fgh0, r2 = 00ab
	XCH		r2		;c = e, a = 00ab, r2 = fgh0
	RAL				;c = 0, a = 0abe, r2 = fgh0

	XCH		r2		;c = 0, a = fgh0, r2 = 0abe
	RAL				;c = f, a = gh00, r2 = 0abe
	XCH		r2		;c = f, a = 0abe, r2 = gh00
	RAL				;c = 0, a = abef, r2 = gh00

	WRM				;STATUS nibble 1 = abef
	FIM		r0, DCL1_STATUS
	SRC		r0
	LD		r2
	WRM				;STATUS nibble 0 = gh00
	JUN		emu_cycle



;Shared operand set-up for DIV and DIVU: check the divisor with divZeroCheck (defined elsewhere),
;then tmp64a.lo = regT (denominator) and hi = regS (numerator).
;Returns with DCL 1 selected (left by the final DCL0 -> DCL1 copy) and r2:r3/r4:r5 restored by the copy.
divu_common_head:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		divZeroCheck

	;tmp64a.lo = T			//denom
	FIM		r2, DCL1_TMP64A
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1

	;hi = S					//num
	JMS		cpuGetRegPtrS_r4r5
	FIM		r2, DCL1_HI
	JUN		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1	;tail call: its BBL returns to our caller


;DIVU: unsigned divide. divu_common_head leaves DCL 1 selected, as the veneer requires.
instr_divu:	;lo = S / T, hi = S % T, unsigned
	JMS		divu_common_head
	JMS		cpuPrvDivu_veneer
	JUN		next_instr

;DIV: signed divide implemented on top of the unsigned divider.
;Both operands are made non-negative, divided, and the results are then sign-corrected:
;  remainder (HI) takes the sign of the numerator          -> negate HI when r14 != 0
;  quotient  (LO) is negative when operand signs differ    -> negate LO when r15 != 0
;Fix relative to the previous revision: the two post-divide tests were swapped (r14 gated the
;quotient and r15 the remainder), which gave wrong results whenever the denominator was negative.
;NOTE(review): relies on the page-1 divider preserving r14 and r15 - confirm
instr_div:	;lo = S / T, hi = S % T, signed
	JMS		divu_common_head

	FIM		r2, DCL1_HI + 7
	SRC		r2
	RDM
	RAL				;C = sign bit of the numerator
	TCC
	XCH		r14		;r14 = "numerator_was_negative" = must_invert_remainder_sign
	LD		r14
	JCN		Z, num_sign_handled

num_is_negative:
	FIM		r2, DCL1_HI
	JMS		cpuPrvNeg32
num_sign_handled:

	FIM		r2, DCL1_TMP64A + 7
	SRC		r2
	RDM
	RAL				;C = sign bit of the denominator
	JCN		C, denom_is_negative

denom_is_positive:
	LD		r14
	XCH		r15		;r15 = "must_invert_quotient_sign" = numerator_was_negative
	JUN		denom_sign_handled

denom_is_negative:
	CLC
	LDM		1
	SUB		r14		;A = 1 - r14
	XCH		r15		;r15 = "must_invert_quotient_sign" = NOT numerator_was_negative
	FIM		r2, DCL1_TMP64A
	JMS		cpuPrvNeg32

denom_sign_handled:
	JMS		cpuPrvDivu_veneer

	LD		r15		;must_invert_quotient_sign
	JCN		Z, quotient_sign_handled
div_negate_quotient:
	FIM		r2, DCL1_LO
	JMS		cpuPrvNeg32
quotient_sign_handled:

	LD		r14		;must_invert_remainder_sign
	JCN		Z, remainder_sign_handled
div_negate_remainder:
	FIM		r2, DCL1_HI
	JMS		cpuPrvNeg32
remainder_sign_handled:
	JUN		next_instr


;JALR: npc = regS, Rd = address of the instruction after the delay slot (skipped when Rd is $0).
;regS is read (by cpuBranchToRegS) before Rd is written.
instr_jalr:
	JMS		cpuBranchToRegS							;this moves NPC to PC, thus the logic below that adds 4 to "PC" is correct
	JMS		cpuGetRegPtrD_r2r3						;into r2:r3
	LD		r2
	JCN		NZ, jalr_save_ret_addr
	LD		r3
	JCN		NZ, jalr_save_ret_addr
	JUN		jalr_addr_saved

jalr_save_ret_addr:
	;Rd = PC
	FIM		r4, DCL1_PC							;src = pc
	FIM		r6, 10H								;DCL1 to DCL0 copy
	JMS		cpuPrvCopy8_r4r5_to_r2r3_with_DCL	;copy 8 nibbles from RAM at DCL_r6.r4:r5 to ram at DCL_r7.r2:r3 (leaves DCL 0 selected, pointers restored)
	
	;Rd += 4
	LDM		4
	JMS		cpuAddNibble						;add A to u32 addressed by r2:r3, pointer is destroyed, r0 is clobbered

jalr_addr_saved:
	LDM		1
	DCL											;emu_cycle needs to be entered in DCL1
	JUN		emu_cycle

;r4:r5 = r2:r3. Clobbers A (left holding the old r5 until BBL loads 0).
prvCopyr2r3_to_r4r5:
	LD		r2
	XCH		r4
	LD		r3
	XCH		r5
	BBL		0

;Exchange the register pairs r2:r3 and r12:r13. Clobbers A. Returns A = 0.
prvSwapr2r3_with_r12r13:
	LD		r2
	XCH		r12		;r12 = old r2, A = old r12
	XCH		r2		;r2 = old r12
	LD		r3
	XCH		r13		;r13 = old r3, A = old r13
	XCH		r3		;r3 = old r13
	BBL		0

;Exchange the register pairs r2:r3 and r4:r5. Clobbers A. Returns A = 0.
prvSwapr2r3_with_r4r5:
	LD		r2
	XCH		r4		;r4 = old r2, A = old r4
	XCH		r2		;r2 = old r4
	LD		r3
	XCH		r5		;r5 = old r3, A = old r5
	XCH		r3		;r3 = old r5
	BBL		0

;MULTU / MULT: copy regT to TMP32B and regS to TMP32A (both in DCL 1), then run the page-1
;multiplier through its veneer. r12 = "is signed" (0 or 1), r13 = 1 = DCL for the veneer.
instr_multu:

	FIM		r12, 001H					;DCL=1, signed = 0

instr_multX:

	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1

	JMS		cpuGetRegPtrS_r4r5
	FIM		r2, DCL1_TMP32A
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1

	JMS		cpuPrvMultX_veneer
	JUN		next_instr

instr_mult:
	FIM		r12, 011H					;DCL=1, signed = 1
	JUN		instr_multX
	
	
;Add the 4-bit value in A to the 8-nibble number at r2:r3 (current DCL).
;The loop body is unrolled twice (4 iterations x 2 nibbles) and exits early as soon as a
;nibble add produces no carry, so r3 is left advanced by a data-dependent amount.
;Clobbers A, C, r0, r3. Returns A = 0.
cpuAddNibble:									;add A to u32 addressed by r2:r3, pointer is destroyed, r0 is clobbered
	XCH		r0
	LDM		16 - 4
	XCH		r0									;r0 = ISZ counter (4 iterations), A = addend again
	CLC

cpuAddNibble_loop:
	SRC		r2
	INC		r3
	ADM
	WRM
	JCN		NC, cpuAddNibble_shortcutOut		;no carry: higher nibbles are unchanged
	LDM		0									;from here on only the carry is being propagated
	SRC		r2
	INC		r3
	ADM
	WRM
	JCN		NC, cpuAddNibble_shortcutOut
	LDM		0
	ISZ		r0, cpuAddNibble_loop
	
cpuAddNibble_shortcutOut:
	BBL		0

;Subtract the 4-bit value in A from the 8-nibble number at r2:r3 (current DCL).
;Implemented as an addition of the two's complement: (~A with carry-in 1) for the lowest nibble,
;then 0xF plus the running carry for each higher nibble. Always processes all 8 nibbles.
;Clobbers A, C, r0; r3 is advanced by 8. Returns A = 0.
cpuSubNibble:									;sub A from u32 addressed by r2:r3, pointer is destroyed, r0 is clobbered
	XCH		r0
	LDM		8
	XCH		r0									;r0 = ISZ counter (8 iterations), A = subtrahend again
	STC
	CMA											;A = ~subtrahend; with C = 1 the ADM below computes nibble - subtrahend
cpuSubNibble_loop:
	SRC		r2
	INC		r3
	ADM
	WRM
	LDM		15									;nibble + 15 + C == nibble - 1 + C: propagates the borrow
	ISZ		r0, cpuSubNibble_loop
	BBL		0


;Fully unrolled copy within the current DCL, from r4:r5 to r2:r3.
;  cpuPrvCopy8_..._noPtrRestore - 8 nibbles
;  cpuPrvCopy5_..._noPtrRestore - 5 nibbles (entry point into the tail of the same code)
;The last nibble is copied without incrementing, so r3 and r5 end up advanced by 7 (resp. 4).
;The code below modifies only A, r3 and r5; C is untouched. Returns A = 0.
cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore:		;pointers are left advanced by 7
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM

cpuPrvCopy5_r4r5_to_r2r3_currDCL_noPtrRestore:		;pointers are left advanced by 4
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	INC		r5
	RDM

	SRC		r2
	INC		r3
	WRM
	
	SRC		r4
	RDM

	SRC		r2
	WRM
	
	BBL		0


;Unimplemented / invalid opcode: raise the "reserved instruction" exception through the normal vector.
instr_undef:
	FIM		r0, CP0_EXC_COD_RI * 4			;reserved instruction trap (code pre-shifted to bits 2..6 of r0:r1)
	JUN		take_exc_normal_vec


;dst += what, 8 nibbles, within the current DCL.
;In: r4:r5 = pointer to the addend, r2:r3 = pointer to dst (also the other operand).
;Out: C = carry out of the top nibble. Clobbers A, r0; r3 and r5 are left advanced by 7.
cpuPrvTwoOperandAdd:		;add. r4:r5 = what, r2:r3 = dst, clobbers r0, moves pointers by 7
	LDM		9
	XCH		r0				;ISZ counter: 7 iterations, the 8th nibble is handled after the loop
	CLC

cpuPrvTwoOperandAdd_inner:
	SRC		r4
	INC		r5
	RDM
	SRC		r2
	INC		r3
	ADM
	WRM
	ISZ		r0, cpuPrvTwoOperandAdd_inner
	SRC		r4				;top nibble: same operation, but without advancing the pointers
	RDM
	SRC		r2
	ADM
	WRM
	BBL		0



;Signed-overflow check for a 32-bit ADD or SUB that has already been computed.
;In:  r0 = top nibble of the result, r6:r7 -> top nibble (nibble 7) of LHS,
;     r4:r5 -> top nibble of RHS, C = 1 if the operation was a subtract, 0 for an add.
;Out: C = 1 on overflow, 0 otherwise. Clobbers A, r0. Returns A = 0.
;Rule: overflow needs sign(result) != sign(LHS), and additionally
;      add: sign(LHS) == sign(RHS)        sub: sign(LHS) != sign(RHS)
;      i.e. (lhsSign + rhsSign + isSub) must be even.
;Fix relative to the previous revision: the final branch tested A != 0 after the RAR, but the
;parity that decides the outcome is the bit RAR moved into C. With the old test, an ordinary
;mixed-sign add such as 5 + (-10) was reported as an overflow.
cpuPrvCalcOvf:					;r0 is result of last ADD/SUB op, r6:r7 points at LHS's nibble 7, r4:r5 points at RHS's nibble 7, C is set if this was a sub
	;overflow requires that output sign not match LHS sign

	;r0 = result_of_last_alu_op
	TCC
	XCH		r0		;r0 = "we had sub", A = result_of_last_alu_op
	RAL
	LDM		0
	RAR				;A = result_of_last_alu_op & 8, C = 0
	SRC		r6
	ADM				;C = 1 IFF "result_of_last_alu_op had high bit set and so did LHS", else: top bit is zero IFF "result_of_last_alu_op had high bit clr and so did LHS"
	JCN		C, cpuPrvHelpCalcOvf_lhs_and_result_both_negative
	RAL
	JCN		C, cpuPrvHelpCalcOvf_lhs_and_result_diff_signs

cpuPrvHelpCalcOvf_lhs_and_result_both_negative:
cpuPrvHelpCalcOvf_lhs_and_result_both_positive:		;means we did not overflow
cpuPrvHelpCalcOvf_no_ovf:
	CLC
	BBL		0
	
cpuPrvHelpCalcOvf_lhs_and_result_diff_signs:		;add: ovf if lhs and rhs signs are same. sub: ovf if lhs and rhs signs differ
	RDM				;LHS top nibble (still selected by the SRC r6 above)
	RAL
	TCC				;A = lhsSign, C = 0
	ADD		r0		;A = lhsSign + isSub
	XCH		r0
	SRC		r4
	RDM
	RAL
	TCC				;A = rhsSign, C = 0
	ADD		r0		;A = lhsSign + rhsSign + isSub (0..3), C = 0
	RAR				;C = parity of that sum
	JCN		C, cpuPrvHelpCalcOvf_no_ovf		;odd sum: operand signs rule out an overflow
cpuPrvHelpCalcOvf_have_ovf:
	STC
	BBL		0

;TGE plus the shared tails used by all the conditional-trap handlers:
;  tlt_common       - signed compare r6:r7 < r4:r5, then tcc_common
;  tcc_common       - trap if C is set, otherwise continue with the next instruction
;  take_int_ovf_exc - raise exception code CP0_EXC_COD_OV through the normal vector
;TGE loads T into r6:r7 and S into r4:r5, so the trap is taken when T < S (signed).
;NOTE(review): that is "S > T"; S == T does not trap here - confirm this is the intended TGE behaviour
;NOTE(review): trap instructions are reported with the overflow exception code - confirm
instr_tge:
	JMS		cpuGetRegPtrT_r6r7
	JMS		cpuGetRegPtrS_r4r5
tlt_common:
	JMS		cpuPrvLt
tcc_common:
	JCN		C, take_int_ovf_exc
	JUN		next_instr

take_int_ovf_exc:
	FIM		r0, CP0_EXC_COD_OV * 4
	JUN		take_exc_normal_vec

;Overflow check for an add: raises the overflow exception (does not return) if cpuPrvCalcOvf
;reports one; otherwise bumps r5 and r7 once more so that both pointers have moved by a full
;8 nibbles before the caller rewinds them.
;In: as for cpuPrvCalcOvf (r0 = result top nibble, r6:r7 / r4:r5 at the operands' top nibbles).
add_common_ovf_handling:
	CLC				;C = 0: "this was an add"
	JMS		cpuPrvCalcOvf
	JCN		C, take_int_ovf_exc
	INC		r5	;so rewind works
	INC		r7
	BBL		0

;ADD (trapping): first compute S + T without storing it ("CMN") to get the top result nibble for
;the overflow check, then, if no overflow and Rd is not $0, redo the add into Rd.
instr_add:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7

	;begin CMN
	LDM		9
	XCH		r0								;ISZ counter: 7 iterations, top nibble handled after the loop
	CLC

cpuPrvCmn_inner:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	ADM										;only the carry is kept for nibbles 0..6
	ISZ		r0, cpuPrvCmn_inner
	SRC		r6
	RDM
	SRC		r4
	ADM
	XCH		r0								;r0 = top nibble of the sum
	;end CMN

	JMS		add_common_ovf_handling			;does not return on overflow
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuPrvRewindTwoRegPtrs
	JMS		cpuPrvThreeOperandAdd
	JUN		next_instr

;SUB (trapping): compare first to get the top result nibble for the overflow check, then,
;if no overflow and Rd is not $0, redo the subtraction into Rd.
instr_sub:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvCmp					;leaves C in proper state, leaves top nibble of result in r0, also moves pointers by 7
	STC									;C = 1: "this was a sub"
	JMS		cpuPrvCalcOvf
	JCN		C, take_int_ovf_exc
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	INC		r5	;so rewind works
	INC		r7
	JMS		cpuPrvRewindTwoRegPtrs
	JMS		cpuPrvThreeOperandSub
	JUN		next_instr
;COP0 write to Context: point r4:r5 at DCL1_CONTEXT and use the shared full-register write handler.
cop0_write_context:
	FIM		r4, DCL1_CONTEXT
	JUN		cop0_write_fullreg

;COP0 read of BadVAddr: copy 8 nibbles from DCL1_BADVA to the DCL0 location in r2:r3, then next_instr.
;NOTE(review): r2:r3 is presumably set to the destination register by the caller - confirm
cop0_read_badVA:
	FIM		r4, DCL1_BADVA
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

;Compare the 8-nibble values at r6:r7 and r4:r5 (current DCL) for equality.
;Out: C = 1 if equal, 0 otherwise. Clobbers A, r0. Returns A = 0.
;Exits on the first differing nibble, so r5/r7 are left advanced by 1..8.
cpuPrvEq:	;calc r6:r7 == r4:r5. return bool in C, clobbers r0
	LDM		8
	XCH		r0				;ISZ counter: 8 iterations
cpuPrvEq_loop:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	CLC						;each nibble is compared on its own, no borrow chain
	SBM
	JCN		NZ, cpuPrvEq_fails
	ISZ		r0, cpuPrvEq_loop
	STC
	BBL		0
cpuPrvEq_fails:
	CLC
	BBL		0

;Unsigned compare: C = 1 if the value at r6:r7 is below the value at r4:r5, else 0.
;Clobbers A, r0; pointers are moved by 7 (see cpuPrvCmp).
cpuPrvLtu:	;calc unsigned r6:r7 < r4:r5. return bool in C, clobbers r0
	JMS		cpuPrvCmp					;leaves C in proper state, leaves top nibble of result in r0, also moves pointers by 7
	;cpuPrvCmp returns C = "no borrow"; we want to produce 1 on borrow
	CMC
	BBL		0

;Signed compare: C = 1 if the value at r6:r7 is less than the value at r4:r5, else 0.
;The low 7 nibbles are subtracted as unsigned; for the top nibble the sign bit of both operands
;is inverted first, which turns the signed comparison into an unsigned one.
;Clobbers A, r0, r1; r5 and r7 are left advanced by 7. Returns A = 0.
cpuPrvLt:	;calc signed r6:r7 < r4:r5. return bool in C, clobbers r0, r1
			;signed compare is same as unsigned, except top bits need to be inverted
	LDM		9
	XCH		r0		;ISZ counter: 7 iterations
	CLC

cpuPrvLt_inner:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	SBM
	CMC				;keep C in the polarity the next SBM expects
	ISZ		r0, cpuPrvLt_inner
	
	TCC				;A = carry state left by the loop (fed back into C unchanged below), C = 0
	XCH		r0		;stash it
	SRC		r6		;load top nibble of LHS
	RDM
	RAL
	CMC
	RAR				;flip its top bit
	XCH		r1		;stash into r1
	SRC		r4		;load top nibble of RHS
	RDM
	RAL
	CMC
	RAR				;flip its top bit
	XCH		r0		;stash it into r0, A is now the carry state saved above
	RAR				;move that into C
	LD		r1		
	SUB		r0		;top-nibble subtract; C = "no borrow"
	CMC				;C = borrow = (LHS < RHS)
	BBL		0

;ADDU: Rd = Rs + Rt, no overflow trap (skipped entirely when Rd is $0).
instr_addu:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3, safe to bail early since ADDU cannot cause a trap
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvThreeOperandAdd
	JUN		next_instr

;SUBU: Rd = Rs - Rt, no overflow trap (skipped entirely when Rd is $0).
instr_subu:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3, safe to bail early since SUBU cannot cause a trap
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvThreeOperandSub
	JUN		next_instr

;SLT: Rd = (Rs < Rt) signed, as 0 or 1 (skipped entirely when Rd is $0).
instr_slt:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvLt						;C = S < T
	JUN		set_dst_reg_simple_val_from_C

;SLTU: Rd = (Rs < Rt) unsigned, as 0 or 1 (skipped entirely when Rd is $0).
;set_dst_reg_simple_val_from_C (shared with SLT): store C as a 32-bit 0/1 into the DCL0 register
;at r2:r3, then next_instr. Clobbers A, C, r3, r4.
instr_sltu:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvLtu						;C = S < T

set_dst_reg_simple_val_from_C:
	LDM		0
	DCL
	TCC

	;val that was in carry is now in A
	SRC		r2
	WRM										;lowest nibble = 0 or 1
	LDM		9
	XCH		r4								;ISZ counter: 7 iterations for the remaining nibbles
	LDM		0
set_dst_reg_simple_val_loop:
	INC		r3
	SRC		r2
	WRM
	ISZ		r4, set_dst_reg_simple_val_loop
	JUN		next_instr


;TLT: trap if S < T (signed). S goes in r6:r7 (left operand), T in r4:r5.
instr_tlt:
	JMS		cpuGetRegPtrT_r4r5
	JMS		cpuGetRegPtrS_r6r7
	JUN		tlt_common

;TGEU, plus tltu_common (unsigned compare r6:r7 < r4:r5, then the shared trap-if-C tail).
;TGEU loads T into r6:r7 and S into r4:r5, so the trap is taken when T < S (unsigned).
;NOTE(review): that is "S > T"; S == T does not trap here - confirm this is the intended TGEU behaviour
instr_tgeu:
	JMS		cpuGetRegPtrT_r6r7
	JMS		cpuGetRegPtrS_r4r5
tltu_common:
	JMS		cpuPrvLtu
	JUN		tcc_common

;TLTU: trap if S < T (unsigned). S goes in r6:r7 (left operand), T in r4:r5.
instr_tlu:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JUN		tltu_common

;TEQ: trap if S == T.
instr_teq:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq						;C = 1 if equal
	JUN		tcc_common

;TNE: trap if S != T.
instr_tne:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq						;C = 1 if equal
	CMC										;C = 1 if different
	JUN		tcc_common


;Zero-fill nibbles in the current DCL starting at r4:r5.
;  cpuPrvZeroFill8  - 8 nibbles
;  cpuPrvZeroFillEx - len nibbles, caller sets r0 = 15 & (16 - len)
;Clobbers A, r0; r5 is advanced by the number of nibbles written. C is untouched. Returns A = 0.
cpuPrvZeroFill8:	;r4:r5 = ptr, assumes pointer does not go over 16 nibble boundary. advances pointer
	LDM		8
	XCH		r0
	;fallthrough
	
cpuPrvZeroFillEx:	;r4:r5 = ptr, r0 = 15 & (16 - len). assumes pointer does not go over 16 nibble boundary. advances pointer.
	LDM		0
cpuPrvZeroFillEx_loop:
	SRC		r4
	INC		r5
	WRM
	ISZ		r0, cpuPrvZeroFillEx_loop
	BBL		0


;Take a PC-relative branch: sign-extend the 16-bit immediate held in the low nibbles of
;DCL1_TMP32C, shift it left by 2, add NPC (the address of the delay slot) and branch there.
;Never returns to the caller; continues at emu_cycle.
cpuPrvRelBranch:
	LDM		1
	DCL
	FIM		r0,	DCL1_TMP32C + 3			;nibble 3 holds the immediate's sign bit
	JMS		cpuPrvRegToSimm16
	LDM		16 - 5
	JMS		cpuPrvTmp32cLsl2			;defined elsewhere; NOTE(review): A presumably is its loop count - confirm
	FIM		r4, DCL1_NPC
	FIM		r2, DCL1_TMP32C
	JMS		cpuPrvTwoOperandAdd			;TMP32C += NPC
	JMS		cpuBranchToAddr
	JUN		emu_cycle

;Extend a 16-bit value stored in the low 4 nibbles of a 32-bit slot to 32 bits.
;  cpuPrvRegToSimm16   - in: r0:r1 = slot + 3; sign-extends (fill bit = top bit of nibble 3)
;  cpuPrvInstrToXimm16 - in: r0:r1 = slot + 4, C = fill bit; fills nibbles 4..7 with 0x0 or 0xF
;Uses the currently selected DCL. Clobbers A, C, r6; r1 is advanced past the slot. Returns A = 0.
cpuPrvRegToSimm16:						;r0:r1 = reg + 3, clobbers r6, expects proper DCL for the data
	SRC		r0
	INC		r1
	RDM
	RAL									;C = sign bit
cpuPrvInstrToXimm16:					;fill value is in C, r0:r1 is reg + 4, expects proper DCL for the data
	LDM		16 - 4
	XCH		r6							;ISZ counter: 4 iterations
	TCC		;0x1 if we need to fill with 0xf, 0x0 if we need to fill with 0x0
	CMA		;0xe if we need to fill with 0xf, 0xf if we need to fill with 0x0
	IAC		;0xf if we need to fill with 0xf, 0x0 if we need to fill with 0x0
cpuPrvInstrToXimm16_fill:
	SRC		r0
	INC		r1
	WRM
	ISZ		r6, cpuPrvInstrToXimm16_fill
	BBL		0


;BLTZAL / BLTZ: branch if rs < 0. The "AL" form stores the return address in $ra first, whether or not
;the branch is taken, then shares the plain form's code.
instr_bltzal:
	JMS		cpuPrvSaveRetAddr
	;fallthrough
instr_bltz:
	JMS		cpuPrvGetTopBitOfRegS			;C = sign bit of rs
	JCN		C, cpuPrvRelBranch				;negative: take the branch
	JUN		next_instr

;BGEZAL / BGEZ: branch if rs >= 0.
instr_bgezal:
	JMS		cpuPrvSaveRetAddr
	;fallthrough
instr_bgez:
	JMS		cpuPrvGetTopBitOfRegS			;C = sign bit of rs
	JCN		NC, cpuPrvRelBranch				;non-negative: take the branch
	JUN		next_instr

;"Likely" variants of the sign-test branches: identical test, but when the branch is NOT taken control goes to
;skip_next_instr instead of next_instr (the delay slot is annulled).
instr_bltzall:
	JMS		cpuPrvSaveRetAddr
	;fallthrough
instr_bltzl:
	JMS		cpuPrvGetTopBitOfRegS			;C = sign bit of rs
	JCN		C, cpuPrvRelBranch
	JUN		skip_next_instr

instr_bgezall:
	JMS		cpuPrvSaveRetAddr
	;fallthrough
instr_bgezl:
	JMS		cpuPrvGetTopBitOfRegS			;C = sign bit of rs
	JCN		NC, cpuPrvRelBranch
	JUN		skip_next_instr

;OR: rd = rs | rt. Has a fast path for the common "or rd, rs, $zero" register move.
instr_or:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	LD		r4
	JCN		NZ, instr_or_real
	LD		r5
	JCN		NZ, instr_or_real				;pointer 0x00 is register 0 ($zero)

	;gcc likes to use ORR with $0 as a MOV - speed that up
instr_or_is_mov:
	;exchange r4:r5 with r6:r7 so that the copy source (r4:r5) is rs
	LD		r4
	XCH		r6
	XCH		r4
	LD		r5
	XCH		r7
	XCH		r5
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	JUN		next_instr

instr_or_real:
	FIM		r12, 000H					;r12 = 0 (veneer requires r12 < 4), r13 = 0: operate in DCL 0
	JMS		cpuPrvThreeOperandOrr_veneer
	JUN		next_instr


;8-nibble subtract without storing the result: (value at r6:r7) - (value at r4:r5), in the current DCL bank.
;Out: C = "no borrow" from the top nibble. The top nibble of the difference is left in r0
;(it is exchanged into r0 just before returning; BBL 0 itself returns with A = 0).
;Both pointers end up advanced by 7. Clobbers A, r0.
cpuPrvCmp:					;subtract. r6:r7 = whence, r4:r5 = what, clobbers r0, leaves C in proper state ("not borrow"), leaves top nibble of result in r0, pointers moved by 7
	LDM		9
	XCH		r0						;7 passes through the loop, the 8th nibble is handled after it
	CLC								;SBM treats C as a borrow-in: start with none

cpuPrvCmp_inner:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	SBM
	CMC								;SBM leaves C = "no borrow"; flip it into the borrow-in form for the next nibble
	ISZ		r0, cpuPrvCmp_inner
	SRC		r6
	RDM
	SRC		r4
	SBM								;top nibble: no CMC, so C stays "no borrow"
	XCH		r0						;keep the top result nibble in r0 (r0 was 0 after the loop)
	BBL		0

;8-nibble subtract with store: [r2:r3] = [r6:r7] - [r4:r5], all in the current DCL bank.
;All three pointers are advanced by 8 (low nibble wraps). Clobbers A, r0.
;C on return is the value left by the final CMC, i.e. the borrow-in form, not cpuPrvCmp's "no borrow" form.
cpuPrvThreeOperandSub:		;subtract. r6:r7 = whence, r4:r5 = what, r2:r3 = dst, clobbers r0, leaves C in proper state
	LDM		8
	XCH		r0						;8 passes
	CLC								;no borrow into the lowest nibble

cpuPrvThreeOperandSub_inner:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	SBM
	CMC								;convert "no borrow" into the borrow-in form for the next nibble
	SRC		r2
	INC		r3
	WRM
	ISZ		r0, cpuPrvThreeOperandSub_inner
	BBL		0

;MTC0 to BadVAddr: plain full-register copy from the GPR into DCL1_BADVA.
cop0_write_badVA:
	FIM		r4, DCL1_BADVA
	JUN		cop0_write_fullreg


;Exit paths for cpuPrvIrqsRecalc: store the "interrupt pending" flag (0 or 1) in status character 1 of the
;RAM register that holds DCL1_PC.
cpuPrvIrqsRecalc_none:
	LDM		0

cpuPrvIrqsRecalc_write:
	FIM		r4, DCL1_PC
	SRC		r4
	WR1												;DLC1_IRQPENDING
	BBL		0

;Recompute the "interrupt pending" flag from STATUS and CAUSE.
;Pending = STATUS bit 0 (IE) AND ((CAUSE[12] AND STATUS[12]) OR (CAUSE[13] AND STATUS[13])).
;Only bits 12 and 13 are examined. Must be called with DCL = 1. Clobbers A, C, r2, r3, r4, r5.
cpuPrvIrqsRecalc:									;call with DCL = 1
	FIM		r4, DCL1_STATUS							;they could be masked
	SRC		r4
	RDM
	RAR												;C = STATUS bit 0 (global interrupt enable)
	JCN		NC, cpuPrvIrqsRecalc_none

	FIM		r2, DCL1_CAUSE + 3						;point to cause's bit 12
	FIM		r4, DCL1_STATUS + 3						;point to status's bit 12

	SRC		r2
	RDM
	XCH		r2										;r2 = {cause[15], cause[14], cause[13], cause[12]}
	SRC		r4
	RDM
													;A = {status[15], status[14], status[13], status[12]}
	RAR												;A = {?, status[15], status[14], status[13]}, C = status[12]
	XCH		r3										;r3 = {?, status[15], status[14], status[13]}, A = ?
	TCC												;A = status[12], C = 0
	XCH		r2										;r2 = status[12], A={cause[15], cause[14], cause[13], cause[12]}
	RAR												;A={?, cause[15], cause[14], cause[13]}, C = cause[12]
	XCH		r2										;A = status[12], r2={?, cause[15], cause[14], cause[13]}, C = cause[12]
	
	JCN		NC, irq_check_next						;cause[12] clear
	JCN		Z, irq_check_next						;status[12] clear (masked)

cpuPrvIrqsRecalc_yes:
	LDM		1
	JUN		cpuPrvIrqsRecalc_write

irq_check_next:
	LD		r2										;A = {?, cause[15], cause[14], cause[13]}
	RAR												;A={?, ?, cause[15], cause[14]}, C = cause[13]
	TCC												;A = cause[13], C = 0
	XCH		r3										;r3 = cause[13], A = {?, status[15], status[14], status[13]}
	RAR												;A={?, ?, status[15], status[14]}, C=status[13]
	LD		r3										;A = cause[13], C=status[13]

	JCN		NC, cpuPrvIrqsRecalc_none				;status[13] clear (masked)
	JCN		Z, cpuPrvIrqsRecalc_none				;cause[13] clear
	JUN		cpuPrvIrqsRecalc_yes



;Bit-bang one nibble, most significant bit first, out of the RAM output port (WMP).
;In: A = nibble to send, r10 = port pattern per the original note ("proper chip select bits plus 3").
;For each bit the port value is (r10 << 1) | bit, written once, then decremented by 2 and written again;
;per the original comments the first write is the clock-low phase and the second the clock-high phase.
;The data is inverted once up front because the line wants inverted data (original author's note).
;Clobbers A, C, r6. The caller must already have selected the target port with SRC/DCL - not done here.
spiRamSendNibble:	;A = nibble, clobbers r6, r10 = proper chip select bits plus 3
	CMA			;we need inverted data, invert it once

	RAL			;C = next data bit (MSB first)
	XCH		r6	;park the remaining bits in r6
	LD		r10	;chip select low, clock low, data bit shown
	RAL
	WMP
	DAC
	DAC			;chip select low, clock high, data bit shown
	WMP

	XCH		r6
	RAL
	XCH		r6
	LD		r10	;chip select low, clock low, data bit shown
	RAL
	WMP
	DAC
	DAC			;chip select low, clock high, data bit shown
	WMP

	XCH		r6
	RAL
	XCH		r6
	LD		r10	;chip select low, clock low, data bit shown
	RAL
	WMP
	DAC
	DAC			;chip select low, clock high, data bit shown
	WMP

	XCH		r6
	RAL			;last bit: no need to save the remainder
	LD		r10	;chip select low, clock low, data bit shown
	RAL
	WMP
	DAC
	DAC			;chip select low, clock high, data bit shown
	WMP

	BBL		0


;MFC0 from EPC: copy the 8-nibble DCL1_EPC into the destination GPR and finish the instruction.
cop0_read_epc:
	FIM		r4, DCL1_EPC
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

	org		0700H
;32-entry jump table for instructions whose top 6 bits are 0b000001 (REGIMM), indexed by the 5 bits at
;offset 16 of the instruction. Each entry is one 2-byte JUN, so entry N lives at page offset 2 * N.
;instr_top_lvl_1 below dispatches through it with JIN, which can only reach addresses in its own ROM page,
;so the table and that routine must stay together in this page.
	JUN		instr_bltz		;0
	JUN		instr_bgez		;1
	JUN		instr_bltzl		;2
	JUN		instr_bgezl		;3
	JUN		instr_undef		;4
	JUN		instr_undef		;5
	JUN		instr_undef		;6
	JUN		instr_undef		;7
	JUN		instr_tgei		;8
	JUN		instr_tgeiu		;9
	JUN		instr_tlti		;10
	JUN		instr_tltiu		;11
	JUN		instr_teqi		;12
	JUN		instr_undef		;13
	JUN		instr_tnei		;14
	JUN		instr_undef		;15
	JUN		instr_bltzal	;16
	JUN		instr_bgezal	;17
	JUN		instr_bltzall	;18
	JUN		instr_bgezall	;19
	JUN		instr_undef		;20
	JUN		instr_undef		;21
	JUN		instr_undef		;22
	JUN		instr_undef		;23
	JUN		instr_undef		;24
	JUN		instr_undef		;25
	JUN		instr_undef		;26
	JUN		instr_undef		;27
	JUN		instr_undef		;28
	JUN		instr_undef		;29
	JUN		instr_undef		;30
	JUN		instr_undef		;31



;Dispatcher for the REGIMM group: builds the byte offset of the matching jump-table entry and jumps to it.
;r11 holds instruction bits 16..19 and the low bit of r10 is bit 20.
instr_top_lvl_1:
;assumes instr word is in r8:r9:r10:r11:r12:r13:r14:r15
;get (((instr >> 16) & 0x1f) << 1) into r6:r7, then jump based on it
	LD		r11
	CLC
	RAL								;A = bits 16..18 shifted left by one, C = bit 19
	XCH		r7						;low nibble of the table offset
	TCC
	XCH		r6						;r6 = bit 19
	LD		r10
	RAR								;C = bit 20
	LDM		0
	RAL								;A = bit 20
	RAL								;A = bit 20 << 1
	ADD		r6						;A = (bit 20 << 1) | bit 19 : high nibble of the table offset
	XCH		r6
	JIN		r6						;jump within this ROM page to offset r6:r7


;MFC0 handlers for COP0 registers that are stored as plain 8-nibble values in DCL 1:
;point r4:r5 at the source and let the shared tail copy it into the destination GPR and finish the instruction.
cop0_read_entryHi:
	FIM		r4, DCL1_ENTRYHI
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

cop0_read_status:
	FIM		r4, DCL1_STATUS
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

cop0_read_cause:
	FIM		r4, DCL1_CAUSE
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

cop0_read_entryLo:
	FIM		r4, DCL1_ENTRYLO
	JUN		copy_8_DCL1_to_DCL0_and_next_instr

;Test whether 8 consecutive nibbles starting at r4:r5 (current DCL bank) are all zero.
;Out: C = 1 if every nibble was zero, C = 0 otherwise. The scan stops at the first non-zero nibble, so the
;pointer is only fully advanced in the all-zero case. Clobbers A, r0.
cpuPrvZeroCheck8:	;r4:r5 = ptr, assumes pointer does not go over 16 byte boundary. advances pointer. on exit: C = "was all zeroes"
	LDM		8
	XCH		r0						;8 passes
	;fallthrough

;Same, with a caller-chosen length: r0 = 15 & (16 - len).
cpuPrvZeroCheckEx:	;r4:r5 = ptr, r0 = 15 & (16 - len). assumes pointer does not go over 16 byte boundary. advances pointer. on exit: C = "was all zeroes"
	CLC								;assume failure; nothing in the loop changes C
cpuPrvZeroCheckEx_loop:
	SRC		r4
	INC		r5
	RDM
	JCN		NZ, cpuPrvZeroCheckEx_fail
	ISZ		r0, cpuPrvZeroCheckEx_loop
	STC								;made it through: all zero
cpuPrvZeroCheckEx_fail:
	BBL		0


;Fetch the sign bit (bit 31) of GPR rs.
;Out: C = bit 31 of rs. Clobbers A, r6, r7 (left pointing at the top nibble of rs).
;Relies on the bank selected by the caller / by cpuGetRegPtrS_r6r7 for the read.
cpuPrvGetTopBitOfRegS:					;into C, clobbers r6 and r7
	JMS		cpuGetRegPtrS_r6r7				;r6:r7 = &rs (nibble 0)
	CLC										;plain add below, no carry-in
	LDM		7
	ADD		r7								;A = low pointer nibble + 7
	XCH		r7								;r6:r7 = &rs nibble 7 (bits 28..31)
	SRC		r6
	RDM
	RAL										;C = bit 31
	BBL		0

;Link step of the "and link" branches/jumps: $ra (GPR 31) = PC + 8.
;Copies the 8-nibble DCL1_PC into GPR 31 in DCL 0, then tail-calls cpuAddNibble with A = 8 and
;r2:r3 = &$ra, so the return happens from there. Clobbers r2..r7 and whatever the two helpers clobber.
cpuPrvSaveRetAddr:
	FIM		r6, 10H						;copy direction: from DCL 1 to DCL 0
	FIM		r2, 31 * 8					;destination: $ra
	FIM		r4, DCL1_PC					;source: PC
	JMS		cpuPrvCopy8_r4r5_to_r2r3_with_DCL
	LDM		8							;amount to add
	FIM		r2, 31 * 8					;point at $ra again for the add
	JUN		cpuAddNibble

;Shift the low N nibbles of DCL1_TMP32C left by two bits, in place.
;In: A = 16 - N (16 - 5 for a 16-bit immediate, 16 - 7 for a 26-bit jump target). Needs the bank holding TMP32C selected.
;Done as two passes of a 1-bit shift across N nibbles; bits shifted out of the top nibble are discarded.
;Clobbers A, C, r0, r1, r6, r7, r8.
cpuPrvTmp32cLsl2:						;on input A = "16 - num nibbles to shift". for 16 input it should be 16 - 5, for 26 bit - 16 - 7). clobbers r0, r1, r6, r7, r8, for simm16 use, fewer iterations are needed, but for imm26 we need them all, so do all
	XCH		r8						;keep the per-pass nibble count
	LDM		16 - 2
	XCH		r7						;two passes
cpuPrvTmp32cLsl2_lsl_outer:
	FIM		r0,	DCL1_TMP32C
	LD		r8
	XCH		r6						;reload the nibble counter
	CLC								;shift a zero into bit 0
cpuPrvTmp32cLsl2_lsl_inner:
	SRC		r0
	INC		r1
	RDM
	RAL								;carry chains from one nibble to the next
	WRM
	ISZ		r6, cpuPrvTmp32cLsl2_lsl_inner
	ISZ		r7, cpuPrvTmp32cLsl2_lsl_outer
	BBL		0

;Operand setup for immediate instructions. On return (DCL = 1 was selected here, and the copy helper's name
;indicates a DCL0-to-DCL1 copy):
;  DCL1_TMP32C = 16-bit immediate extended to 32 bits (zero-extended here, sign-extended in the simm16 entry)
;  DCL1_TMP32A = value of GPR rs
;TMP32C is expected to already hold the instruction word, so its low four nibbles are the immediate.
;Called with JMS; the return is performed by the copy helper that is tail-jumped to.
instr_uimm16_1reg_common:
	FIM		r0,	DCL1_TMP32C + 4
	CLC								;fill bit = 0: zero extension
	LDM		1
	DCL
	JMS		cpuPrvInstrToXimm16
	JUN		instr_Ximm16_1reg_common

instr_simm16_1reg_common:							;for instrs that need regS (goes to TMP32A) and simm16 (goes to TMP32C)
	FIM		r0,	DCL1_TMP32C + 3
	LDM		1
	DCL
	JMS		cpuPrvRegToSimm16		;fill bit = bit 15 of the immediate
	;fallthrough

instr_Ximm16_1reg_common:
	JMS		cpuGetRegPtrS_r4r5
	FIM		r2, DCL1_TMP32A
	JUN		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1

;Tiny pointer loaders shared by the immediate-operand handlers. Each one only loads two register pairs
;and returns; the two routines differ in which temporary ends up in which pair.

;r4:r5 -> DCL1_TMP32C, r6:r7 -> DCL1_TMP32A
prvPointR4toTMP32C_R6toTMP32A:
	FIM		r4, DCL1_TMP32C
	FIM		r6, DCL1_TMP32A
	BBL		0

;r4:r5 -> DCL1_TMP32A, r6:r7 -> DCL1_TMP32C
prvPointR4toTMP32A_R6toTMP32C:
	FIM		r6, DCL1_TMP32C
	FIM		r4, DCL1_TMP32A
	BBL		0

;TGEI (REGIMM entry 8): architecturally "trap if rs >= sign_extend(imm16)", signed.
;
;The previous body loaded its operands and then simply ran off its end into instr_nor, which follows it in
;ROM. A TGEI therefore executed a NOR using whatever register numbers its immediate field happened to decode
;to and silently overwrote a GPR.
;
;A real implementation needs at least one more 2-byte jump than the 4 bytes this handler occupies.
;NOTE(review): by my byte count the code between "org 0700H" and "org 0C00H" is packed so that growing any
;routine here pushes page-local JCN/ISZ instructions further down across a 256-byte page boundary. To
;implement TGEI properly, release two of the padding NOPs that precede j_cpuPrvTakeAddressError and add the
;compare/trap tail here.
;
;Until then fail loudly instead of corrupting state: treat the encoding like every other unimplemented
;REGIMM slot. The handler is kept at exactly its original 4 bytes so nothing after it moves.
instr_tgei:
	JUN		instr_undef
	NOP								;padding, never executed: keeps the handler 4 bytes long
	NOP

;NOR: rd = ~(rs | rt). The bitwise work is done by code in ROM page 1, reached through a veneer.
instr_nor:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	FIM		r12, 000H						;r12 = 0 (veneer requires r12 < 4), r13 = 0: operate in DCL 0
	JMS		cpuPrvThreeOperandNor_veneer
	JUN		next_instr

;XOR: rd = rs ^ rt. Same structure as NOR.
instr_xor:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	FIM		r12, 000H						;r12 = 0 (veneer requires r12 < 4), r13 = 0: operate in DCL 0
	JMS		cpuPrvThreeOperandXor_veneer
	JUN		next_instr

	
;JAL: store the return address in $ra, then behave exactly like J.
instr_jal:
	JMS		cpuPrvSaveRetAddr
	;fallthrough

;J: target = (NPC & 0xF0000000) | (imm26 << 2).
;DCL1_TMP32C is expected to hold the instruction word; it is turned into the target address in place.
instr_j:
	LDM		1
	DCL
	LDM		16 - 7
	JMS		cpuPrvTmp32cLsl2				;shift the low 7 nibbles (28 bits) left by 2
	FIM		r0, DCL1_NPC + 7
	SRC		r0
	RDM										;A = top nibble of NPC
	FIM		r0, DCL1_TMP32C + 7
	SRC		r0
	WRM										;becomes the top nibble of the target
	JMS		cpuBranchToAddr
	JUN		emu_cycle

	
;8-nibble add with store: [r2:r3] = [r6:r7] + [r4:r5], all in the current DCL bank.
;Out: C = carry out of the top nibble. All three pointers end up advanced by 7 (the last nibble is handled
;outside the loop without incrementing). Clobbers A, r0.
cpuPrvThreeOperandAdd:		;add. r6:r7 = whence, r4:r5 = what, r2:r3 = dst, clobbers r0, leaves C in proper state, moves pointers by 7
	LDM		9
	XCH		r0						;7 passes through the loop
	CLC								;no carry into the lowest nibble

cpuPrvThreeOperandAdd_inner:
	SRC		r6
	INC		r7
	RDM
	SRC		r4
	INC		r5
	ADM								;add with carry; C chains to the next nibble
	SRC		r2
	INC		r3
	WRM
	ISZ		r0, cpuPrvThreeOperandAdd_inner
	SRC		r6						;8th (top) nibble, pointers left in place
	RDM
	SRC		r4
	ADM
	SRC		r2
	WRM
	BBL		0

;Hypercall entry: hands over to the handler in ROM page 1 through its veneer, then finishes the instruction.
cpuExtHypercall:			;responsible for jumping to next instr when done. opcode is in AT's lower nibble
	FIM		r12, 0					;veneer requires r12 < 4
	JMS		cpuPrvHypercall_veneer
	JUN		next_instr

;TLTI (REGIMM entry 10): trap if rs < sign_extend(imm16), signed.
;
;cpuPrvLt returns C = (value at r6:r7) < (value at r4:r5), so rs (TMP32A) has to be the r6:r7 operand and the
;immediate (TMP32C) the r4:r5 operand - the same arrangement instr_slti uses.
;The previous code pointed r4:r5 at rs and r6:r7 at the immediate and trapped on "not (imm < rs)", i.e. on
;rs <= imm, so it trapped incorrectly whenever rs == imm.
;Both replaced instructions have the same size as before, so nothing after this handler moves.
instr_tlti:
	JMS		instr_simm16_1reg_common		;TMP32C = sign-extended imm16, TMP32A = rs
	JMS		prvPointR4toTMP32C_R6toTMP32A	;r4:r5 -> imm, r6:r7 -> rs
	JMS		cpuPrvLt						;C = (rs < imm)
	JCN		NC, tcc_no_trap					;not less: no trap

;Shared "take the trap" exit, also the JCN target of the other trap-immediate handlers.
;Must stay within JCN reach (same ROM page) of everything that targets it.
tcc_take_trap:
	JUN		take_int_ovf_exc

;TGEIU (REGIMM entry 9): architecturally "trap if rs >= sign_extend(imm16)", unsigned.
;
;The previous body loaded its operands and then fell through into instr_tltiu, which redoes the whole setup
;with its own pointer arrangement - so TGEIU really executed as TLTIU and its trap condition was wrong.
;Implementing it properly needs a jump into the shared compare, i.e. more bytes than the 4 available here.
;NOTE(review): by my byte count this ROM region is packed so that growing a routine pushes page-local JCN/ISZ
;instructions further down across a page boundary; to implement TGEIU, release two of the padding NOPs that
;precede j_cpuPrvTakeAddressError.
;Until then report the encoding as unimplemented rather than trapping on the wrong condition. Size is kept
;at the original 4 bytes.
instr_tgeiu:
	JUN		instr_undef
	NOP								;padding, never executed: keeps the handler 4 bytes long
	NOP

;TLTIU (REGIMM entry 11): trap if rs < sign_extend(imm16), unsigned.
;cpuPrvLtu returns C = (value at r6:r7) < (value at r4:r5), so rs (TMP32A) must be the r6:r7 operand - the
;same arrangement instr_sltiu uses. The previous code had the operands the other way around and trapped on
;"not (imm < rs)", i.e. on rs <= imm, which trapped incorrectly when rs == imm. Same byte count as before.
instr_tltiu:
	JMS		instr_simm16_1reg_common		;TMP32C = sign-extended imm16, TMP32A = rs
	JMS		prvPointR4toTMP32C_R6toTMP32A	;r4:r5 -> imm, r6:r7 -> rs
	JMS		cpuPrvLtu						;C = (rs < imm), unsigned
	JCN		C, tcc_take_trap

;Shared "no trap / branch not taken" exit; JCN target for several handlers, so it must stay in their ROM page.
tcc_no_trap:
	JUN		next_instr


;TEQI: trap if rs == sign_extend(imm16). cpuPrvEq leaves C = "equal".
instr_teqi:
	JMS		instr_simm16_1reg_common		;TMP32C = sign-extended imm16, TMP32A = rs
	JMS		prvPointR4toTMP32A_R6toTMP32C
	JMS		cpuPrvEq
	JCN		C, tcc_take_trap
	JUN		next_instr

;TNEI: trap if rs != sign_extend(imm16).
instr_tnei:
	JMS		instr_simm16_1reg_common		;TMP32C = sign-extended imm16, TMP32A = rs
	JMS		prvPointR4toTMP32A_R6toTMP32C
	JMS		cpuPrvEq
	JCN		NC, tcc_take_trap
	JUN		next_instr


;BEQ / BNE: compare rs with rt and either take the relative branch or continue with the next instruction.
;tcc_no_trap is reused here purely as a nearby "JUN next_instr" that a page-local JCN can reach.
instr_beq:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq						;C = (rs == rt)
	JCN		NC, tcc_no_trap
	JUN		cpuPrvRelBranch

instr_bne:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq						;C = (rs == rt)
	JCN		C, tcc_no_trap
	JUN		cpuPrvRelBranch


;BEQL / BNEL: "likely" forms - when the branch is not taken the following instruction is skipped.
instr_beql:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq
	JCN		NC, likely_no_branch
	JUN		cpuPrvRelBranch

instr_bnel:
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	JMS		cpuPrvEq
	JCN		C, likely_no_branch
	JUN		cpuPrvRelBranch

;Local landing pad so the page-local JCNs above can reach skip_next_instr.
likely_no_branch:
	JUN		skip_next_instr

;BLEZ: branch if rs <= 0. Computed as "not (0 < rs)" with $zero as the left operand of the signed compare.
instr_blez:
	JMS		cpuGetRegPtrS_r4r5
	FIM		r6, 00H						;point to $zero
	JMS		cpuPrvLt					;calc signed r6:r7 < r4:r5. return bool in C, clobbers r0, r1, r8
	JCN		C, tcc_no_trap				;0 < rs: not taken
	JUN		cpuPrvRelBranch

;BGTZ: branch if rs > 0, i.e. 0 < rs.
instr_bgtz:
	JMS		cpuGetRegPtrS_r4r5
	FIM		r6, 00H						;point to $zero
	JMS		cpuPrvLt					;calc signed r6:r7 < r4:r5. return bool in C, clobbers r0, r1, r8
	JCN		NC, tcc_no_trap				;not (0 < rs): not taken
	JUN		cpuPrvRelBranch




;Common tail of the immediate ALU instructions: copy the result from DCL1_TMP32B into GPR rt and finish.
;Judging by its users in this file, cpuGetRegPtrT_r4r5_skip_if_zero leaves the destination pointer in r2:r3
;and preserves r4:r5 (the copy source is loaded into r4:r5 before it is called) - TODO confirm in its definition.
;The first entry point performs the destination lookup once more before falling into the second, which does
;it again; no user of the first label is visible in this part of the file.
instr_with_imm_get_dst_and_final_copy:
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero

instr_with_imm_final_copy:
	FIM		r4, DCL1_TMP32B				;source: the computed result
	FIM		r6, 10H						;copy direction: DCL 1 to DCL 0
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero
	JUN		copy_8_and_next_instr

;ADDIU: rt = rs + sign_extend(imm16). The sum is built in DCL1_TMP32B and then copied to rt.
instr_addiu:
	JMS		instr_simm16_1reg_common		;TMP32C = sign-extended imm16, TMP32A = rs
	JMS		prvPointR4toTMP32A_R6toTMP32C
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvThreeOperandAdd
	JUN		instr_with_imm_final_copy

;SLTI: rt = (rs < sign_extend(imm16)), signed. rs is the r6:r7 operand, the immediate the r4:r5 operand.
instr_slti:
	JMS		instr_simm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero	;destination for the result writer (which writes through r2:r3)
	JMS		cpuPrvLt	;calc signed r6:r7 < r4:r5. return bool in C, clobbers r0, r1, r8
	JUN		set_dst_reg_simple_val_from_C

;SLTIU: rt = (rs < sign_extend(imm16)), unsigned compare.
instr_sltiu:
	JMS		instr_simm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero	;destination for the result writer (which writes through r2:r3)
	JMS		cpuPrvLtu	;calc unsigned r6:r7 < r4:r5. return bool in C
	JUN		set_dst_reg_simple_val_from_C

;ANDI / ORI / XORI: rt = rs <op> zero_extend(imm16).
;Operands are staged in DCL 1 (TMP32A = rs, TMP32C = immediate), the page-1 routine reached through the
;veneer writes DCL1_TMP32B, and the common tail copies that into rt.
instr_andi:
	JMS		instr_uimm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	FIM		r2, DCL1_TMP32B
	FIM		r12, 001H					;r12 = 0 (veneer requires r12 < 4), r13 = 1: operate in DCL 1
	JMS		cpuPrvThreeOperandAnd_veneer
	JUN		instr_with_imm_final_copy

instr_ori:
	JMS		instr_uimm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	FIM		r2, DCL1_TMP32B
	FIM		r12, 001H					;r12 = 0 (veneer requires r12 < 4), r13 = 1: operate in DCL 1
	JMS		cpuPrvThreeOperandOrr_veneer
	JUN		instr_with_imm_final_copy

instr_xori:
	JMS		instr_uimm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	FIM		r2, DCL1_TMP32B
	FIM		r12, 001H					;r12 = 0 (veneer requires r12 < 4), r13 = 1: operate in DCL 1
	JMS		cpuPrvThreeOperandXor_veneer
	JUN		instr_with_imm_final_copy

;LUI: rt = imm16 << 16. The whole register is zeroed, then the four immediate nibbles are copied from the
;low half of DCL1_TMP32C (expected to hold the instruction word) into the upper half of rt.
;NOTE(review): the zero fill writes through the currently selected bank; this relies on DCL selecting the GPR
;bank at this point - confirm against the dispatcher.
instr_lui:
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero
	JMS		prvCopyr2r3_to_r4r5
	JMS		cpuPrvZeroFill8				;yes we fill 4 too many but this is fewer bytes
	CLC
	LDM		4
	ADD		r3
	XCH		r3							;r2:r3 = &rt + 4: the upper four nibbles
	FIM		r4, DCL1_TMP32C
	FIM		r6, 10H						;copy direction: DCL 1 to DCL 0
	FIM		r0, 0CCh		;copy 4 nibbles
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL			;copy X nibbles from RAM at DCL_r6.r4:r5 to RAM at DCL_r7.r2:r3. it is assumed that the nibble addresses do not cross a 16-boundary. to copy X nibbles, set r0:r1 to 0x11 * (0x0f & (16 - x))

	JUN		next_instr

;Report the current privilege level: C = 1 when STATUS bit 1 is clear (kernel mode), C = 0 when it is set.
;The first entry selects DCL 1 itself; the second assumes the caller already did. Clobbers A, r0, r1.
cpuPrvIsInKernelMode:									;return in C, corrupts r0:r1
	LDM		1
	DCL

cpuPrvIsInKernelMode_DCL1:								;return in C, corrupts r0:r1, assumes you are in DCL1
	FIM		r0, DCL1_STATUS
	SRC		r0
	RDM
	RAR
	RAR								;C = STATUS bit 1
	CMC								;invert: C = 1 means kernel
	BBL		0

;Gate for coprocessor-1 instructions: returns normally only if STATUS bit 29 (second bit of the top nibble)
;is set; otherwise raises "coprocessor unusable" with coprocessor number 1 and does not return.
;Leaves DCL = 1. Clobbers A, C, r0, r1.
cpuPrvCopAccess_1:
	LDM		1
	DCL
	FIM		r0, DCL1_STATUS + 7
	SRC		r0
	RDM
	RAR
	RAR								;C = STATUS bit 29
	JCN		C, cpuPrvCopAccess_1_ok
	LDM		1						;coprocessor number for the exception
	JUN		cpuPrvTakeCoprocUnusableExc
cpuPrvCopAccess_1_ok:
	BBL		0

;Entry stubs for the plain loads and stores. Each one passes a size code in A and, for loads, a
;"sign-extend" flag in C, then joins the shared implementation.
;Size code = 15 - log2(access size in bytes): 0xF = byte, 0xE = halfword, 0xD = word. The alignment check in
;mem_access_do counts this value up to 16 and tests one low address bit per step.
instr_lb:
	LDM		0FH
	STC								;sign-extend
	JUN		mem_load_common

instr_lh:
	LDM		0EH
	STC								;sign-extend
	JUN		mem_load_common

instr_lw:
	LDM		0DH
	CLC								;full word, nothing to extend
	JUN		mem_load_common

instr_lbu:
	LDM		0FH
	CLC								;zero-extend
	JUN		mem_load_common

instr_lhu:
	LDM		0EH
	CLC								;zero-extend
	JUN		mem_load_common

instr_sb:
	LDM		0FH
	JUN		mem_store_common

instr_sh:
	LDM		0EH
	JUN		mem_store_common

instr_sw:
	LDM		0DH
	JUN		mem_store_common


;Record the faulting virtual page number in EntryHi: copies nibbles 3..7 (address bits 12..31) of
;DCL1_TMP32B into the same nibbles of DCL1_ENTRYHI, leaving EntryHi's low three nibbles alone.
;In: DCL1_TMP32B = address, bank holding both already selected. Tail-calls the 5-nibble copy, which returns to our caller.
cpuPrvSetEntryHiVa:					;DCL1_TMP32B = addr
	FIM		r2, DCL1_ENTRYHI + 3		;destination: EntryHi, from bit 12 up
	FIM		r4, DCL1_TMP32B + 3			;source: address, from bit 12 up
	JUN		cpuPrvCopy5_r4r5_to_r2r3_currDCL_noPtrRestore

;Record a faulting address: BadVAddr = addr, and Context's BadVPN field (bits 2..20) = addr bits 12..30.
;In: DCL1_TMP32B = addr, DCL = 1. TMP32B is destroyed in the process.
;Clobbers A, C, r0..r5. Tail-calls the 5-nibble copy, which returns to our caller.
cpuPrvSetBadVA:						;DCL1_TMP32B = addr (corrupted)
;cpu.badva = va;
	FIM		r4, DCL1_TMP32B
	FIM		r2, DCL1_BADVA
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore

;cpu.context = (cpu.context & CP0_CTX_PTEBASE_MASK) | (((va >> 12) << CP0_CTX_BADVPN2_SHIFT) & CP0_CTX_BADVPN2_MASK);

	;shift DCL1_TMP32B's top 20 bits left 2 (last carry out should be carefully preserved)
	LDM		16 - 2
	XCH		r3						;two 1-bit passes
cpuPrvSetBadVA_loop1_outer:
	FIM		r0, DCL1_TMP32B + 3
	LDM		16 - 5
	XCH		r2						;five nibbles per pass
	CLC								;shift in a zero
cpuPrvSetBadVA_loop1_inner:
	SRC		r0
	INC		r1
	RDM
	RAL
	WRM
	ISZ		r2, cpuPrvSetBadVA_loop1_inner
	ISZ		r3, cpuPrvSetBadVA_loop1_outer
	;C now holds the bit shifted out by the second pass (address bit 30)

	;place the last shifted out bit into CONTEXT's bit 20
	TCC
	XCH		r2						;r2 = that bit (0 or 1)
	FIM		r0, DCL1_CONTEXT + 5
	SRC		r0
	RDM								;A = Context bits 20..23
	RAR								;drop the old bit 20
	XCH		r2
	RAR								;C = new bit 20
	XCH		r2
	RAL								;A = {bit 23, bit 22, bit 21, new bit 20}
	WRM

	;copy the remaining 20 bits from DCL1_TMP32B to CONTEXT
	FIM		r4, DCL1_TMP32B + 3
	FIM		r2, DCL1_CONTEXT
	JUN		cpuPrvCopy5_r4r5_to_r2r3_currDCL_noPtrRestore

;TLB exception entry points. None of them return: each ends in one of the take_exc_* routines.
;Common inputs: DCL1_TMP32B = faulting address, r11 = access type, where 0 is treated as a write.
;EntryHi must be updated before BadVA because cpuPrvSetBadVA destroys TMP32B.

;TLB miss. Addresses with bit 31 set go to the normal exception vector; the rest use the dedicated
;refill vector.
cpuPrvTakeTlbRefillExc:
	LDM		1
	DCL
	JMS		cpuPrvSetEntryHiVa
	JMS		cpuPrvSetBadVA
	FIM		r2, DCL1_BADVA + 7
	SRC		r2
	RDM
	RAL								;C = address bit 31
	JCN		C, sendTlbExcToNormalVec
;only user-mode refill exceptions left now

	LD		r11
	JCN		Z, cpuPrvTakeTlbRefillExc_wasUserWrite

cpuPrvTakeTlbRefillExc_wasUserRead:
	FIM		r0, CP0_EXC_COD_TLBL * 4
	LDM		EXC_VEC_KU_REFILL
	JUN		take_exc_with_vec

cpuPrvTakeTlbRefillExc_wasUserWrite:
	FIM		r0, CP0_EXC_COD_TLBS * 4
	LDM		EXC_VEC_KU_REFILL
	JUN		take_exc_with_vec

;Write to a page whose TLB entry is valid but not marked dirty.
cpuPrvTakeTlbModifiedExc:
	LDM		1
	DCL
	JMS		cpuPrvSetEntryHiVa
	JMS		cpuPrvSetBadVA
	FIM		r0, CP0_EXC_COD_MOD * 4
	JUN		take_exc_normal_vec

;Matching TLB entry found but its valid bit is clear.
cpuPrvTakeTlbInvalidExc:		;DCL1_TMP32B = addr, r11 = access type (0 = write)
	LDM		1
	DCL
	JMS		cpuPrvSetEntryHiVa
	JMS		cpuPrvSetBadVA

sendTlbExcToNormalVec:
	LD		r11
	JCN		Z, cpuPrvTakeTlbInvalidExc_wasWrite

cpuPrvTakeTlbInvalidExc_wasRead:
	FIM		r0, CP0_EXC_COD_TLBL * 4
	JUN		take_exc_normal_vec

cpuPrvTakeTlbInvalidExc_wasWrite:
	FIM		r0, CP0_EXC_COD_TLBS * 4
	JUN		take_exc_normal_vec

;Look up the virtual address in DCL1_TMP32B in the software TLB.
;The TLB is organised as hash buckets with chained entries:
;  bucket index (5 bits) = address nibble 3 + address nibble 6 (4-bit sum plus its carry)
;  DLC1_HASH_PRESENT[idx] is non-zero if the bucket has entries; the head entry index is stored two RAM
;  registers further on.
;  Each entry is one RAM register in bank TLB_ENTRIES_DCL: characters 1..2 = ASID, 3..7 = virtual page number,
;  10 = flag nibble (bit 0 = global), status character 1 = "has next", status character 0 = next entry index.
;In:  DCL = 1, DCL1_TMP32B = address.
;Out: C = 1 and r0 = entry index on a hit (DCL = TLB_ENTRIES_DCL), C = 0 on a miss (DCL is whatever the search left selected).
;Clobbers r0..r7.
cpuPrvTlbHashSearch:	;TMP32B = addr, matched entry idx in r0, C indicates success. on success, DCL *WILL* be set to TLB_ENTRIES_DCL. clobbers r0, r1, r2, r3, r4, r5, r6, r7, DCL
	;get current ASID	into r6:r7
	FIM		r2, DCL1_ENTRYHI + 1
	SRC		r2
	INC		r3
	RDM
	RAR
	RAR
	CLC
	RAL
	CLC
	RAL								;low two bits cleared: only EntryHi bits 6..7 remain
	XCH		r7
	SRC		r2
	RDM
	XCH		r6						;EntryHi bits 8..11

	;hash address
	FIM		r2, DCL1_TMP32B + 3
	SRC		r2
	RDM
	FIM		r2, DCL1_TMP32B + 6
	SRC		r2
	CLC
	ADM							;C:A now has the entry index. 
	XCH		r1
	LDM		DLC1_HASH_PRESENT SHR 5
	RAL
	XCH		r0					;r0:r1 = &DLC1_HASH_PRESENT[idx]
	SRC		r0
	RDM
	JCN		Z, cpuPrvTlbHashSearch_fail
	INC		r0
	INC		r0					;r0:r1 = &DLC1_HASH_HEADS[idx]
	SRC		r0
	RDM
	XCH		r0					;r0 = index of the first entry in the chain

cpuPrvTlbHashSearch_entry:
	LDM		3
	XCH		r1					;r0:r1 = &TLBENTRY[idx].va in TLB_ENTRIES_DCL
	FIM		r2, DCL1_TMP32B + 3
	LDM		16 - 5
	XCH		r4					;compare five nibbles

cpuPrvTlbHashSearch_check_va_loop:
	LDM		1					;load nibble from DCL1_TMP32B (Addr)
	DCL
	SRC		r2
	INC		r3
	RDM
	XCH		r5
	LDM		TLB_ENTRIES_DCL
	DCL
	LD		r5
	SRC		r0
	INC		r1
	CLC
	SBM							;compare
	JCN		NZ, cpuPrvTlbHashSearch_entry_mismatch
	ISZ		r4, cpuPrvTlbHashSearch_check_va_loop

cpuPrvTlbHashSearch_entry_addr_match:	;DCL is TLB_ENTRIES_DCL
	;ASID must match or entry must be global
	LDM		10
	XCH		r1					;r0:r1 = &TLBENTRY[idx].flags in TLB_ENTRIES_DCL
	SRC		r0
	RDM
	RAR							;C = global bit
	JCN		C, cpuPrvTlbHashSearch_entry_works

cpuPrvTlbHashSearch_entry_notGlobal:	;not global, so check ASID
	LDM		1
	XCH		r1
	SRC		r0
	INC		r1
	RDM
	CLC
	SUB		r7
	JCN		NZ, cpuPrvTlbHashSearch_entry_mismatch
	SRC		r0
	LD		r6
	CLC
	SBM
	JCN		NZ, cpuPrvTlbHashSearch_entry_mismatch

cpuPrvTlbHashSearch_entry_works:
	STC
	BBL		0

cpuPrvTlbHashSearch_entry_mismatch:		;check if there is a "next" entry
	RD1							;status characters belong to the RAM register last addressed with SRC r0
	JCN		Z, cpuPrvTlbHashSearch_fail		;no next
	RD0
	XCH		r0
	JUN		cpuPrvTlbHashSearch_entry

cpuPrvTlbHashSearch_fail:
	CLC
	BBL		0


;Six bytes of padding (free space according to the original note).
;NOTE(review): the padding also determines where everything after it lands relative to the 256-byte ROM page
;boundaries, and JCN/ISZ can only target their own page - recheck those jumps before removing or adding bytes here.
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP		;space here

;Trampolines: JCN is page-local, so the memory-access code below reaches these far exception handlers by
;conditionally jumping to one of these unconditional JUNs.
j_cpuPrvTakeAddressError
	JUN		cpuPrvTakeAddressError

j_cpuPrvTakeTlbRefillExc:
	JUN		cpuPrvTakeTlbRefillExc
                        
j_cpuPrvTakeTlbInvalidExc:
	JUN		cpuPrvTakeTlbInvalidExc

;Effective address of a load/store: DCL1_TMP32B = rs + sign_extend(imm16).
;Tail-calls the adder, which returns to our caller. Callers only have this one call level to spare:
;JMS from a handler -> here -> instr_simm16_1reg_common -> its helpers uses all three stack levels.
mem_instr_addr_calc:		;calc address into TMP32B
	JMS		instr_simm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	FIM		r2, DCL1_TMP32B
	JUN		cpuPrvThreeOperandAdd						;TMP32B now has the desired addr

;Perform one guest memory access: alignment check, virtual-to-physical translation, then dispatch by
;physical address (falls into mem_access_with_pa).
;In:  DCL = 1, DCL1_TMP32B = virtual address, r8 = 15 - log2(access size in bytes),
;     r11 = access type (MEM_*), data to be written in DCL1_TMP32C.
;Out: returns only on success; read data is taken from DCL1_TMP32C by the callers; r9 = the r8 value on entry.
;     On any fault the matching exception is taken instead of returning.
;Not clobbered: r12, r13, r14.
mem_access_do:		;r8 = 15 - lg2(access_size_in_bytes), r11 = accessType(MEM_*), NOT clobbered: r12, r13, r14, must return r9 at value that r8 was going in, DATA: write into TMP32C, read from TMP32C
	LD		r8
	XCH		r9						;remember the size code for the caller
	;check alignment
	FIM		r2, DCL1_TMP32B
	SRC		r2
	RDM								;A = lowest address nibble
	JUN		align_check_check
align_check_loop:
	RAR								;C = next low address bit
	JCN		C, j_cpuPrvTakeAddressError
align_check_check:
	ISZ		r8, align_check_loop	;0 bits tested for bytes, 1 for halfwords, 2 for words

;Second entry point for callers that have already aligned the address themselves.
mem_access_do_with_calced_aligned_addr:		;r8 = 15 - lg2(access_size_in_bytes), r9 = same as r8, TMP32B = addr, r11 = accessType(MEM_*), NOT clobbered: r12, r13, r14, must be called with DCL=1

;cpuPrvMemTranslate
	FIM		r2, DCL1_TMP32B + 7
	SRC		r2
	RDM
	RAL								;C = address bit 31
	JCN		NC, cpuPrvMemTranslate_user
	XCH		r4						;keep the shifted nibble for the bit 30 test
	;kernel addrs require kernel mode. check for that and report an address error if we fail the check
	JMS		cpuPrvIsInKernelMode_DCL1
	JCN		NC, j_cpuPrvTakeAddressError
	LD		r4
	RAL								;C = address bit 30
	JCN		C, cpuPrvMemTranslate_kernel

;addresses 0x80000000..0xBFFFFFFF are not translated through the TLB: physical = virtual & 0x1FFFFFFF
cpuPrvMemTranslate_fixedMap
	SRC		r2
	RDM
	RAR
	TCC								;A = address bit 28 only
	WRM								;top nibble now has bits 29..31 cleared
	JUN		mem_access_with_pa

cpuPrvMemTranslate_kernel:		;we already checked permissions, so now the paths are the same
cpuPrvMemTranslate_user:
	
	JMS		cpuPrvTlbHashSearch	;TMP32B = addr, matched entry idx in r0, C indicates success. on success, DCL *WILL* be set to TLB_ENTRIES_DCL. clobbers r0, r1, r2, r3, r4, r6, r7, DCL
	JCN		NC, j_cpuPrvTakeTlbRefillExc

	;check V
	LDM		10
	XCH		r1						;r0:r1 = &entry.flags
	SRC		r0
	RDM
	RAR
	RAR								;C = flag bit 1 (valid)
	JCN		NC, j_cpuPrvTakeTlbInvalidExc

	;check D if write
	RAR								;C = flag bit 2 (dirty)
	JCN		C, cpuPrvMemTranslate_success
	LD		r11
	CLC
	RAR
	JCN		NZ, cpuPrvMemTranslate_success	;access type >= 2 is not a write
	JUN		cpuPrvTakeTlbModifiedExc


cpuPrvMemTranslate_success:
	;replace the virtual page number in TMP32B with the physical one stored in the entry (characters 11..15)
	LD		r0
	XCH		r4
	LDM		11
	XCH		r5
	FIM		r2, DCL1_TMP32B + 3
	FIM		r6, (TLB_ENTRIES_DCL SHL 4) + 1	;copy from TLB_ENTRIES_DCL to DCL 1
	FIM		r0, 0BBh						;5 nibbles
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL


;Route an access by physical address. Jump target / fall-through tail: each device handler that is jumped
;to performs the access and returns to the original caller; unmapped addresses raise a bus error.
mem_access_with_pa:		;we now have a PA in TMP32B. can be called externally but no perm checks are done. DCL = 1 on entry

;BASE			LENGTH			USE
;0x00000000		0x00800000		RAM	(two 8MByte chunks for ease of fast access)
;0x04000000		0x00800000		RAM	(two 8MByte chunks for ease of fast access)
;0x17000000		0x01000000		DecBusErrorReporter
;0x1c000000		0x01000000		DZ11 UART
;0x1d000000		0x01000000		DS1287 RTC
;0x1e000000		0x01000000		DecBusCSR
;we check for RAM access first as it is the most important

	FIM		r0, DCL1_TMP32B + 7
	SRC		r0
	RDM								;A = physical address bits 28..31
	JCN		NZ,	mem_access_not_ram

	;be optimistic about things being RAM - it is faster. assume all things below 256M are RAM
	JUN		memAccessSpiRam

mem_access_not_ram:
	DAC
	JCN		Z, mem_dispatch_0b0001x	;top nibble == 1: the device window

;Bus error: remember the address, then raise the instruction or data bus error depending on the access type.
mem_signal_bus_error:
	LDM		1
	DCL
	FIM		r4, DCL1_TMP32B
	FIM		r2, DCL1_BUS_ERROR_ADDR
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	LD		r11
	DAC
	DAC
	JCN		Z, mem_signal_bus_error_code_fetch	;access type 2 = instruction fetch

mem_signal_bus_error_data_access:
	FIM		r0, CP0_EXC_COD_DBE * 4
	JUN		take_exc_normal_vec

mem_signal_bus_error_code_fetch:
	FIM		r0, CP0_EXC_COD_IBE * 4
	JUN		take_exc_normal_vec

;Decode address bits 24..27 one bit at a time. STC + RAL appends a marker 1 below the nibble so that the
;"== 7" test further down can be done with CMA.
mem_dispatch_0b0001x:
	FIM		r0, DCL1_TMP32B + 6
	SRC		r0
	RDM
	STC
	RAL								;C = bit 27
	JCN		C, mem_dispatch_0b00011x
	CMA								;zero only if bits 24..26 were all ones (0x17......)
	JCN		NZ, mem_signal_bus_error

	JUN		memAccessDecBusErrorReporter

mem_dispatch_0b00011x:
	RAL								;C = bit 26
	JCN		NC, mem_signal_bus_error

mem_dispatch_0b000111x:
	RAL								;C = bit 25
	JCN		C, mem_dispatch_0b0001111x

mem_dispatch_0b0001110x:
	RAL								;C = bit 24
	JCN		C, mem_dispatch_0b00011101x
	JUN		memAccessDZ11			;0x1c......
mem_dispatch_0b00011101x:
	JUN		memAccessDS1287			;0x1d......

mem_dispatch_0b0001111x:
	RAL								;C = bit 24
	JCN		C, mem_signal_bus_error	;0x1f...... is unmapped
	JUN		memAccessDecBusCSR		;0x1e......

;Shared implementation of LB/LBU/LH/LHU/LW.
;In: A = 15 - log2(access size in bytes) (0xF byte, 0xE halfword, 0xD word), C = 1 to sign-extend.
;Reads into DCL1_TMP32C, copies the loaded nibbles into rt and fills the remaining nibbles of rt with the
;extension value. Exits to next_instr.
mem_load_common:	;A = 15 - lg2(access_size_in_bytes), C=SEXT?
	XCH		r8
	TCC
	XCH		r14						;r14 = sign-extend flag (0 or 1)
	JMS		cpuGetRegPtrT_r2r3
	JMS		prvSwapr2r3_with_r12r13	;park the destination pointer in r12:r13 across the access
	LDM		MEM_READ_DATA
	XCH		r11
	JMS		mem_instr_addr_calc
	JMS		mem_access_do			;only returns on success
	JMS		prvSwapr2r3_with_r12r13
	JMS		dst_zero_check			;only returns if we should provide an answer
	;r14 now has the extending info, r9 the size code, TMP32C has the data
	;copy the relevant part of the data first
	FIM		r4, DCL1_TMP32C
	FIM		r6, 10H					;copy direction: DCL 1 to DCL 0

	LD		r9
	XCH		r1						;save the size code in r1 while r9 is used as the loop counter
	CLC					;convert from r9 = 15 - lg2(access_size_in_bytes) to r0 = r1 = 16 - access_size_in_nibbles (r9 is restored)
	LDM		1
mem_calc_move_sz:
	RAL
	ISZ		r9, mem_calc_move_sz	;A = 2, 4 or 8 = access size in nibbles
	CMA
	IAC								;A = 16 - access size in nibbles
	XCH		r0
	LD		r0
	XCH		r1						;r1 = same count, A = saved size code
	XCH		r9						;r9 = size code again
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL
	;extend must be done for accesses of under 4 byte length. r9 is 0xd for a word; the other valid values are 0xe and 0xf
	;bit 1 of r9 tells them apart: check it and bail if this was a word
	LD		r9
	RAR
	RAR
	JCN		C, mem_load_common_extend
	JUN		next_instr

mem_load_common_extend:
		;we only get here for 1 and 2 byte (2 and 4 nibble) accesses
		;for them, r9 will be 0x0f and 0x0e respectively
		;number of useful nibbles will be 2 and 4, calced as 1 + ~(r9 << 1)
		;index of highest usable nibble is one less than that
	LD		r9
	CLC
	RAL
	CMA			;A is now the index of the last useful nibble
	CLC
	ADD		r3
	XCH		r3	;r2:r3 points to last nibble of useful data
	
	SRC		r2
	INC		r3
	RDM
	RAL			;C now has the bit we need to propagate
	TCC
	ADD		r14	;if sext was requested AND val is negative, A is now 2, else it is 1 or zero
	RAR
	CMA
	IAC		;A = nibble to fill the high bits with
	XCH		r1

	;r0 = 8 - 2 * r9 (mod 16): with C set, each SUB subtracts r9 + 1
	LDM		10
	STC
	SUB		r9
	STC
	SUB		r9
	XCH		r0	;r0 is now 16 - num_useless_nibbles

	LD		r1

mem_load_common_extend_loop:
	SRC		r2
	INC		r3
	WRM
	ISZ		r0, mem_load_common_extend_loop
	JUN		next_instr

;Address error (misaligned access, or kernel address used outside kernel mode). Does not return.
;In: DCL1_TMP32B = faulting address, r11 = access type (0 is treated as a store).
;Relies on the caller having DCL = 1 selected (cpuPrvSetBadVA works in the current bank).
cpuPrvTakeAddressError:		;DCL1_TMP32B = addr, r11 = access type (0 = write)
	JMS		cpuPrvSetBadVA
	LD		r11
	JCN		Z, cpuPrvTakeAddressError_wasWrite

cpuPrvTakeAddressError_wasRead:
	FIM		r0, CP0_EXC_COD_ADEL * 4
	JUN		take_exc_normal_vec

cpuPrvTakeAddressError_wasWrite:
	FIM		r0, CP0_EXC_COD_ADES * 4
	JUN		take_exc_normal_vec

;Shared implementation of SB/SH/SW.
;In: A = 15 - log2(access size in bytes).
;The value of rt is staged in DCL1_TMP32C. When STATUS bit 16 is set the store is silently dropped
;(the original comment names this bit STATUS.ISC).
mem_store_common:		;A = 15 - lg2(access_size)
	XCH		r8
	JMS		mem_instr_addr_calc
	JMS		cpuGetRegPtrT_r4r5		;into r4:r5, T is at bit 16
	FIM		r2, DCL1_TMP32C
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1

	;DCL now == 1. check for STATUS.ISC
	FIM		r4, DCL1_STATUS + 4
	SRC		r4
	RDM
	RAR								;C = STATUS bit 16
	JCN		C, skip_store_due_to_isc

	LDM		MEM_WRITE
	XCH		r11
	JMS		mem_access_do			;only returns on success

skip_store_due_to_isc:
	JUN		next_instr


;Add 8 (mod 16) to the low nibble of both r4:r5 and r6:r7. For pointers that a helper advanced by 8 within
;one 16-nibble RAM register this puts them back at their starting position. Clobbers A, C.
cpuPrvRewindTwoRegPtrs:
	CLC
	LDM		8
	ADD		r5
	XCH		r5
	CLC
	LDM		8
	ADD		r7
	XCH		r7
	BBL		0

;Guard for the divide instructions: if the 8-nibble value at r4:r5 is zero, abandon the instruction by
;jumping straight to next_instr (the pending return address is simply never used); otherwise return with
;r5 restored. Clobbers A, C, r0, r1.
divZeroCheck:	;ptr in r4:r5, will jump to next instr if zero, clobbers r0 and r1
	LD		r5
	XCH		r1						;save the low pointer nibble; the check advances r5
	JMS		cpuPrvZeroCheck8
	JCN		NC, divZeroCheckPasses
	JUN		next_instr

divZeroCheckPasses:
	LD		r1
	XCH		r5
	BBL		0

;BREAK and SYSCALL: raise the corresponding exception through the normal vector.
;r0:r1 carries the exception code already multiplied by 4.
instr_break:
	FIM		r0, CP0_EXC_COD_BP * 4
	JUN		take_exc_normal_vec

instr_syscall:
	FIM		r0, CP0_EXC_COD_SYS * 4
	JUN		take_exc_normal_vec

;LL (load linked): a word load that additionally sets the LLbit.
;The LLbit lives in status character 0 of the RAM register that holds DCL1_CAUSE.
instr_ll:
	JMS		cpuGetRegPtrT_r2r3
	JMS		prvSwapr2r3_with_r12r13	;park the destination pointer across the access
	LDM		0DH						;word access
	XCH		r8
	LDM		MEM_READ_DATA
	XCH		r11
	JMS		mem_instr_addr_calc
	JMS		mem_access_do			;only returns on success
	JMS		prvSwapr2r3_with_r12r13
	JMS		dst_zero_check			;only returns if we should provide an answer
	FIM		r4, DCL1_TMP32C
	FIM		r6, 10H					;copy direction: DCL 1 to DCL 0
	JMS		cpuPrvCopy8_r4r5_to_r2r3_with_DCL
	LDM		1
	DCL								;DCL does not change A, so A is still 1 for the write below
	FIM		r0, DCL1_CAUSE
	SRC		r0
	WR0								;DLC1_LLBIT
	JUN		next_instr

;SC (store conditional): store rt only if the LLbit is still set; rt then receives 1 on success, 0 on failure.
;The LLbit lives in status character 0 of the RAM register that holds DCL1_CAUSE (see instr_ll) and is
;consumed here: read with RD0, cleared with WR0.
;The previous code cleared it with WRM, which writes the main-memory character addressed by SRC - the
;lowest nibble of CAUSE itself - so the LLbit was never cleared and CAUSE bits 0..3 were zeroed on every SC.
instr_sc:
	JMS		mem_instr_addr_calc
	JMS		cpuGetRegPtrT_r4r5		;into r4:r5, T is at bit 16
	LD		r4
	XCH		r12
	LD		r5
	XCH		r13						;r12:r13 = &rt, kept for writing the result
	FIM		r2, DCL1_TMP32C
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1	;stage the value to store; DCL is 1 afterwards
	FIM		r0, DCL1_CAUSE		;C = llbit, llbit = 0
	SRC		r0
	RD0								;DLC1_LLBIT
	CLC
	RAR								;C = old LLbit, A = 0
	WR0								;clear DLC1_LLBIT (status character, not CAUSE's data nibble)
	JCN		NC, sc_fails_no_llbit
	LDM		MEM_WRITE
	XCH		r11
	LDM		0DH						;word access
	XCH		r8
	JMS		mem_access_do			;only returns on success
	STC								;report success
sc_fails_no_llbit:
	JMS		prvSwapr2r3_with_r12r13	;r2:r3 = &rt
	JMS		dst_zero_check			;only returns if we should provide an answer
	JUN		set_dst_reg_simple_val_from_C

;COP1: the only coprocessor-1 instruction supported is "CFC1 rt, $0" (read FIR), which returns 0.
;Everything else is reported as an undefined instruction.
instr_cop1:
	;we allow CFC1 to read register 0 (FIR)
	;coprocessor instr is indicated by bits 28..31 being 0100. coprocessor number is encoded in bits 26..27, that it is a CFC is indicated by bits 21..25 being 00010
	;destination CPU reg is regT (bits 16..20), source coprocessor reg is encoded in regD slot (bits 11..15). bits 0..10 are SBZ but we do not care
	;by the time we get here, bits 26..31 have been verified, we need to verify bits 21..25 and 11..15. Conveniently those are bits for regS and regD respectively normally
	;our accessors shift reg numbers left by 3, so if we use them, we should check that regS pointer is 0x10 and that regD pointer is zero
	JMS		cpuPrvCopAccess_1			;does not return if coprocessor 1 is not enabled
	JMS		cpuGetRegPtrS_r6r7			;into r6:r7
	LD		r7
	JCN		NZ, cop_inval
	LD		r6
	DAC
	JCN		NZ, cop_inval				;rs field must be 2 (pointer 0x10)
	JMS		cpuGetRegPtrD_r2r3			;into r2:r3, D is at bit 11
	LD		r3
	JCN		NZ, cop_inval
	LD		r2
	JCN		NZ, cop_inval				;rd field must be 0
	;we now know that this is a CFC1 to read FIR
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero
	CLC									;value to store: 0
	JUN		set_dst_reg_simple_val_from_C

;Local landing pad so the page-local JCNs above can reach instr_undef.
cop_inval:
	JUN		instr_undef


;Three small helpers shared by the COP0 Index/Random handlers. They exist only to save ROM bytes.

;Select DCL 0, advance r2:r3 by 2 and store r4 there (nibble 2 of the destination GPR, bits 8..11).
cop0_read_common:	;weird subroutine to save space: DCL = 0; r2:r3 += 2; DO_SRC(r2:r3); *MEM = r4
	LDM		0
	DCL
	INC		r3
	INC		r3
	SRC		r2
	LD		r4
	WRM
	BBL		0

;Advance r2:r3 by 5 and latch the address with SRC; the caller performs the actual read or write.
cop0_read_common_2:	;weird subroutine to save space: r2:r3 += 5; DO_SRC(r2:r3)
	LDM		5
	CLC
	ADD		r3
	XCH		r3
	SRC		r2
	BBL		0

;Advance r2:r3 by 2 and load the nibble found there into r4 (current DCL bank).
cop0_write_common:	;weird subroutine to save space: r2:r3 += 2; DO_SRC(r2:r3); r4 = *MEM
	INC		r3
	INC		r3
	SRC		r2
	RDM
	XCH		r4
	BBL		0

;MFC0 handlers for registers that are not stored as plain 32-bit values.
;Entry conditions for all of them (from the dispatch table): r2:r3 = &rt, rt already zeroed, DCL = 1.

;Index: the index field goes to nibble 2 of rt (bits 8..11), the stored P nibble to nibble 7 (bits 28..31).
cop0_read_index:				;r2:r3 = &regT, DCL = 1, regT is zeroed
	FIM		r0, DCL1_INDEX_IDX
	SRC		r0
	RDM	
	XCH		r4					;"idx" field of MIPS's index register
	RD0							;DCL1_INDEX_P	
	XCH		r5					;"P" field of MIPS's index register
	JMS		cop0_read_common	;weird subroutine to save space: DCL = 0; r2:r3 += 2; *r2:r3 = r4			(write IDX field)
	JMS		cop0_read_common_2	;weird subroutine to save space: r2:r3 += 5; DO_SRC(r2:r3)					(write P field)
	LD		r5
	WRM
	JUN		next_instr

;Random: the current value goes to nibble 2 of rt (bits 8..11).
cop0_read_random:
	FIM		r0, DCL1_RANDOM
	SRC		r0
	RDM
	XCH		r4
	JMS		cop0_read_common	;weird subroutine to save space: DCL = 0; r2:r3 += 2; *r2:r3 = r4
	JUN		next_instr

;PRId: constant 0x00000220. rt is already zero, so only nibbles 1 and 2 are written.
cop0_read_PRID:		;PRID_VALUE for R3000 is  0x220
	LDM		0
	DCL
	LDM		2
	INC		r3
	SRC		r2
	WRM
	INC		r3
	SRC		r2
	WRM
	JUN		next_instr

;MTC0 handlers for registers that are written as plain 32-bit values.
;On entry r2:r3 = &rt (source). Each handler loads the destination into r4:r5; the shared tail swaps the two
;pairs so that r4:r5 is the source and r2:r3 the destination, then copies from DCL 0 to DCL 1 and finishes.
cop0_write_entryHi:
	FIM		r4, DCL1_ENTRYHI
	JUN		cop0_write_fullreg

cop0_write_epc:
	FIM		r4, DCL1_EPC
	;fallthrough

cop0_write_fullreg:
	JMS		prvSwapr2r3_with_r4r5
	JUN		copy_8_DCL0_to_DCL1_and_next_instr

;Read the chain link of one TLB entry.
;In:  r2 = entry index (RAM register number), bank already selected by the caller.
;Out: r4 = character 0 of that register, C = bit 0 of character 1; r3 is left at 1.
;NOTE(review): cpuPrvTlbHashSearch reads the "next" link from the entry's status characters (RD0/RD1), whereas
;this routine reads main characters 0 and 1 - presumably it is used on a different bank or table; confirm with the callers.
tlbw_get_next:					;IN: r2 = entryIDX. OUT: r4 = TLB[entryIDX].nextIdx, C = TLB[entryIDX].haveNext
	LDM		0
	XCH		r3
	SRC		r2
	INC		r3
	RDM
	XCH		r4					;r4 = "nextIdx if any"
	SRC		r2
	RDM
	RAR
	BBL		0


;MTC0 to Cause: only bits 8 and 9 (the two software interrupt bits) are writable.
;Nibble 2 of Cause is rebuilt as {old bit 11, old bit 10, new bit 9, new bit 8} using rotates only, then
;the pending-interrupt state is recomputed.
;In the step comments below the letters a..d are the old Cause nibble (a = bit 11) and d..g the written
;nibble (the written nibble's top bit is also shown as "d"; it is discarded).
cop0_write_cause:				;DCL = 0, r2:r3 = &regT
	;i32a = (3 << CP0_CAUSE_IP_SHIFT);	//0x0000_0300
	;cpu.cause = (cpu.cause &~ i32a) | (cpuGetRegT(instr) & i32a);
	JMS		cop0_write_common	;weird subroutine to save space: r2:r3 += 2; DO_SRC(r2:r3); r4 = *MEM
	LDM		1
	DCL
	FIM		r0, DCL1_CAUSE + 2
	SRC		r0		;want: abfg
	RDM				;C:? A:abcd	R4:defg
	RAR				;C:d A:?abc	R4:defg
	RAR				;C:c A:d?ab	R4:defg
	XCH		r4		;C:c A:defg	R4:d?ab
	RAL				;C:d A:efgc	R4:d?ab
	RAL				;C:e A:fgcd	R4:d?ab
	RAL				;C:f A:gcde	R4:d?ab
	XCH		r4		;C:f A:d?ab	R4:gcde
	RAL				;C:d A:?abf	R4:gcde
	XCH		r4		;C:d A:gcde	R4:?abf
	RAL				;C:g A:cded	R4:?abf
	XCH		r4		;C:g A:?abf	R4:cded
	RAL				;C:? A:abfg	R4:cded
	WRM
	JMS		cpuPrvIrqsRecalc	;the interrupt state may have changed
	JUN		next_instr

;MFC0 from Context: plain 8-nibble copy into rt.
cop0_read_context:
	FIM		r4, DCL1_CONTEXT
	JUN		copy_8_DCL1_to_DCL0_and_next_instr


;MTC0 to Status: copy rt into DCL1_STATUS, then recompute the pending-interrupt state because the enable
;and mask bits may have changed. On entry r2:r3 = &rt; the swap makes it the copy source.
cop0_write_status:
	FIM		r4, DCL1_STATUS
	JMS		prvSwapr2r3_with_r4r5
	JMS		cpuPrvCopy8_r4r5_to_r2r3_DCL0to1
	JMS		cpuPrvIrqsRecalc	;the interrupt enables may have changed
	JUN		next_instr

;Common front end of the unaligned word instructions (the LWL/LWR/SWL/SWR family, judging by the label names).
;Computes the effective address, splits it into a word-aligned address (left in DCL1_TMP32B) and the low two
;bits (returned in r14), then tail-jumps into the memory access code, which returns to our caller.
;In (second entry): A = access type for the permission checks.
;Out: r14 = address & 3, r12:r13 = &rt, r11 = access type, r8 = r9 = 0xD (word access).
instr_XwX_common_start_read:
	LDM		MEM_READ_DATA		;read perm checks
	;fallthrough

instr_XwX_common_start:
	XCH		r15						;keep the access type while A is needed elsewhere
	JMS		cpuGetRegPtrT_r2r3
	JMS		prvSwapr2r3_with_r12r13
	;this is the same as mem_instr_addr_calc, but we have no stack levels left
	JMS		instr_simm16_1reg_common
	JMS		prvPointR4toTMP32C_R6toTMP32A
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvThreeOperandAdd						;TMP32B now has the desired addr
	;end of the inlined mem_instr_addr_calc
	LD		r15
	XCH		r11
	FIM		r0, DCL1_TMP32B
	SRC		r0
	RDM
	XCH		r14						;r14 = original low address nibble
	RDM
	RAR
	RAR
	CLC
	RAL
	CLC
	RAL
	WRM				;clear bottom 2
	LD		r14
	CLC
	SBM								;original nibble - aligned nibble = the two low bits
	XCH		r14		;low bits
	FIM		r8, 0DDH
	JUN		mem_access_do_with_calced_aligned_addr		;word access on the aligned address; PA ends up in TMP32B (for mem_access_with_pa)




;Copy-parameter helpers for the unaligned word instructions. Both take r14 = address & 3 and produce the
;length code in r0 that cpuPrvCopyX_r4r5_to_r2r3_with_DCL expects (16 - number of nibbles).

;"Left" variant: 2 + 2 * (addr & 3) nibbles are involved; the register-side pointer r2:r3 is advanced by
;6 - 2 * (addr & 3).
instr_Xwl_calc:
		;len_in_nibbles = 2 + 2 * (Addr & 3)
		;r0 for cpuPrvCopyX_r4r5_to_r2r3_with_DCL = 16 - len_in_nibbles = 16 - (2 + 2 * (Addr & 3)) = 14 - 2 * (Addr & 3)
		;reg += 6 - 2 * (Addr & 3)
	LD		r14							;addr & 3
	STC
	RAL									;A = (addr & 3) * 2 + 1, C = 0
	CMA									;A = 16 - ((addr & 3) * 2 + 2) == 16 - len_in_nibbles
	XCH		r0
	LDM		6		;carry still 0
	ADD		r3
	CLC
	SUB		r14
	CMC								;turn SUB's "no borrow" result back into "no borrow-in" for the second SUB
	SUB		r14
	XCH		r3
	BBL		0

;"Right" variant: 8 - 2 * (addr & 3) nibbles are involved; r4:r5 is pointed at DCL1_TMP32C + 2 * (addr & 3).
instr_Xwr_calc:
		;len_in_nibbles = 8 - 2 * (Addr & 3)
		;r0 for cpuPrvCopyX_r4r5_to_r2r3_with_DCL = 16 - len_in_nibbles = 8 + 2 * (Addr & 3)
		;mem += 2 * (Addr & 3)
	LDM		8
	CLC
	ADD		r14	;guaranteed to not carry
	ADD		r14	;guaranteed to not carry
	XCH		r0
	LD		r14
	RAL			;guaranteed to not carry
	FIM		r4, DCL1_TMP32C
	ADD		r5
	XCH		r5
	BBL		0



;MTC0 to Index. Stores the P nibble (nibble 7 of rt) in status character 0 and the index (nibble 2 of rt,
;reduced modulo the number of TLB entries) in the data character at DCL1_INDEX_IDX.
;Status character 1 of the same RAM register holds "number of TLB entries - 1".
cop0_write_index:		;DCL = 0, r2:r3 = &regT
	JMS		cop0_write_common		;weird subroutine to save space: r2:r3 += 2; DO_SRC(r2:r3); r4 = *MEM		(read IDX part of mips's index register)
	JMS		cop0_read_common_2		;weird subroutine to save space: r2:r3 += 5; DO_SRC(r2:r3)					(point to the P part of mips's index register)
	RDM
	XCH		r5						;"P" part of mips's index register
	LDM		1
	DCL
	FIM		r0, DCL1_INDEX_IDX
	SRC		r0
	LD		r5
	WR0								;DCL1_INDEX_P
	RD1								;DCL1_NUM_TLB_ENTRIES_M1
	;modular reduction of written value to be less than num tlb entries.
	;this is the most compact way, speed is not that important - this is not done that often
	XCH		r4						;r4 is now "num tlb entries - 1", A is now "written value's low nibble"
	STC								;with C set, SUB subtracts r4 + 1 = the number of entries
idx_reduce:
	SUB		r4
	JCN		C, idx_reduce			;no borrow yet: keep subtracting (C is still set for the next pass)
	STC
	ADD		r4						;went one step too far: add the number of entries back
	WRM
	JUN		next_instr


	org		0C00H

;Dispatch tables for MFC0 (read) and MTC0 (write), indexed by COP0 register number; each entry is one
;2-byte JUN. Registers without a handler go to instr_undef.
cop0_read_tab:					;table must be at a 16-byte-aligned address. handlers can rely on: r2:r3 = &regT, register has been zeroed, DCL = 1
	JUN		cop0_read_index		;0
	JUN		cop0_read_random	;1
	JUN		cop0_read_entryLo	;2
	JUN		instr_undef			;3
	JUN		cop0_read_context	;4
	JUN		instr_undef			;5
	JUN		instr_undef			;6
	JUN		instr_undef			;7
	JUN		cop0_read_badVA		;8
	JUN		instr_undef			;9
	JUN		cop0_read_entryHi	;10
	JUN		instr_undef			;11
	JUN		cop0_read_status	;12
	JUN		cop0_read_cause		;13
	JUN		cop0_read_epc		;14
	JUN		cop0_read_PRID		;15

cop0_write_tab:					;table must be at a 16-byte-aligned address. handlers can rely on: r2:r3 = &regT, DCL = 0
	JUN		cop0_write_index	;0
	JUN		instr_undef			;1
	JUN		cop0_write_entryLo	;2
	JUN		instr_undef			;3
	JUN		cop0_write_context	;4
	JUN		instr_undef			;5
	JUN		instr_undef			;6
	JUN		instr_undef			;7
	JUN		cop0_write_badVA	;8
	JUN		instr_undef			;9
	JUN		cop0_write_entryHi	;10
	JUN		instr_undef			;11
	JUN		cop0_write_status	;12
	JUN		cop0_write_cause	;13
	JUN		cop0_write_epc		;14
	JUN		instr_undef			;15



;raise a "coprocessor unusable" exception.
;	cpuPrvTakeCoprocUnusableExc_0: entry point for coprocessor 0 (loads cpNo = 0 and falls through)
;	cpuPrvTakeCoprocUnusableExc:   entry point with cpNo already in A
;does not return to the caller: ends with a jump to take_exc_normal_vec
cpuPrvTakeCoprocUnusableExc_0:
	LDM		0

cpuPrvTakeCoprocUnusableExc:							;cpNo = A, assumes DCL=1
	;	cpu.cause = (cpu.cause &~ CP0_CAUSE_CE_MASK) | ((((uint32_t)cpNo) << CP0_CAUSE_CE_SHIFT) & CP0_CAUSE_CE_MASK);
	;	take_exc_normal_vec will write CAUSE.BD so we can clobber it. bit 30 is unused so we can clobber that too
	FIM		r0, DCL1_CAUSE + 7
	SRC		r0
	WRM								;top nibble of CAUSE = cpNo (whole nibble is overwritten, see note above)

	;	cpuPrvTakeException(CP0_EXC_COD_CPU);
	FIM		r0, CP0_EXC_COD_CPU * 4	;r0:r1 = exception code, pre-multiplied by 4 (presumably already shifted into CAUSE.ExcCode position - confirm in take_exc_normal_vec)
	JUN		take_exc_normal_vec


;decode and dispatch a COP0 instruction.
;instruction nibbles used here: r9 = bits 24..27, r10 = bits 20..23, r14 = bits 4..7 (same nibble layout as
;cpuGetRegPtrS_r4r5, which extracts the field at bit 21 from r9/r10).
;user mode -> coprocessor-unusable exception. otherwise the 5-bit field at bit 21 ("rs") selects:
;	rs < 16  -> cop0_dispatch_lt16 (with r0 = rs)
;	rs == 16 -> "CO" operations: r14 == 0 -> cop0_lt_16 (TLB ops), r14 == 1 -> RFE (fallthrough into cop0_rfe)
;	anything else -> cop0_inval
instr_cop0:
	JMS		cpuPrvIsInKernelMode
	JCN		NC, cpuPrvTakeCoprocUnusableExc_0
	LD		r9
	RAR								;C = bit 24
	XCH		r0
	LD		r10
	RAR								;A = bits 24..21 == low 4 bits of rs
	XCH		r0	;lo byte
	RAR								;C = bit 25 == top bit of rs
	TCC								;A = top bit of rs, C = 0
	JCN		Z, cop0_dispatch_lt16
	LD		r0
	JCN		NZ, cop0_inval			;top bit set: only rs == 16 exactly is accepted

;COP0
	;skip verifying the zeroes (mask 0x01ffffe0)
	LD		r14
	JCN		Z, cop0_lt_16
	DAC
	JCN		NZ, cop0_inval			;r14 must be exactly 1 to reach RFE below

;RFE: clear the LL bit and pop the KU/IE stack in STATUS (bits 5..2 move down to bits 3..0, bits 5..4 are kept).
;only the two low nibbles of STATUS are read, and only the lowest one is rewritten.
;assumes DCL = 1. clobbers r0, r1, r2, A, C (plus whatever cpuPrvIrqsRecalc clobbers).
cop0_rfe:
	;cpu.llbit = 0;
	FIM		r0, DCL1_STATUS
	SRC		r0						;DLC1_LLBIT
	LDM		0
	WR0
	;cpu.status =
	;							(cpu.status &~ (CP0_STATUS_KUP | CP0_STATUS_IEP | CP0_STATUS_KUC | CP0_STATUS_IE)) |
	;							((cpu.status & (CP0_STATUS_KUO | CP0_STATUS_IEO | CP0_STATUS_KUP | CP0_STATUS_IEP)) >> 2);
	INC		r1						;r0:r1 = DCL1_STATUS + 1, but the SRC latch still selects nibble 0 until the next SRC
	RDM								;A = status nibble 0 (bits 3..0)
	XCH		r2
	SRC		r0
	RDM								;A = status nibble 1 (bits 7..4)
	RAR								;C = bit 4 (the incoming carry ends up in a discarded bit)
	XCH		r2
	RAR								;A = bits 4,3,2,1; C = bit 0
	XCH		r2
	RAR								;C = bit 5
	XCH		r2
	RAR								;A = bits 5,4,3,2 == new low nibble of status
	FIM		r0, DCL1_STATUS
	SRC		r0
	WRM
	JMS		cpuPrvIrqsRecalc		;we could have affected irq enablement

cop0_j_next_instr:					;in-page trampoline to next_instr (also the target for the never-taken BC0F)
	JUN		next_instr

;dispatch for COP0 "CO" operations whose r14 nibble is zero; r15 (lowest instruction nibble) selects the operation:
;	1 -> TLBR, 2 -> TLBWI, 6 -> TLBWR, 8 -> TLBP, anything else -> instr_undef
cop0_lt_16:
	LD		r15
	DAC
	JCN		Z, cop0_tlbr			;r15 == 1
	DAC
	JCN		Z, cop0_tlbwi			;r15 == 2
	LDM		6
	CLC
	SUB		r15						;A = 6 - r15
	JCN		Z, cop0_tlbwr			;r15 == 6
	IAC
	IAC								;A = 8 - r15
	JCN		Z, cop0_tlbp			;r15 == 8
	JUN		instr_undef

;dispatch for COP0 instructions with rs < 16. r0 = rs (set up by instr_cop0):
;	0 -> MFC0, 2 -> CFC0, 4 -> MTC0, 8 -> BC0x (treated as a no-op), anything else -> undefined instruction
cop0_dispatch_lt16:
	LD		r0
	JCN		Z, cop0_mfc0			;rs == 0
	DAC
	DAC
	JCN		Z, cop0_cfc0			;rs == 2
	DAC
	DAC
	JCN		Z, cop0_mtc0			;rs == 4
	LDM		8
	CLC
	SUB		r0						;A = 8 - rs
	JCN		Z, cop0_j_next_instr		;BC0F treated as a never-taken branch
	;fallthrough

cop0_inval:							;in-page trampoline to instr_undef for the JCNs in this area
	JUN		instr_undef
	
;MFC0 / CFC0 (handled identically): zero the destination GPR, then jump to the per-register read handler.
;the read handlers are entered with r2:r3 = &regT, the register zeroed and DCL = 1 (see cop0_read_tab).
cop0_mfc0:
cop0_cfc0:
	JMS		cpuGetRegPtrT_r4r5_skip_if_zero			;into r2:r3		NOTE(review): helper name says r4r5, comment says r2:r3 - confirm against the helper
	JMS		prvCopyr2r3_to_r4r5
	LDM		0
	DCL
	JMS		cpuPrvZeroFill8
	FIM		r0, cop0_read_tab
	LDM		1
	DCL
	;fallthrough

;common tail of MFC0/MTC0: index the jump table whose address is in r0:r1 with the 5-bit field at bit 11 ("rd")
;rd = all of r12 plus the top bit of r13. rd >= 16 is rejected. entries are 2 bytes, so target = table + 2 * rd.
;JIN can only reach the current ROM page, so this code must share a page with both tables.
cop0_reg_access:								;r2:r3 = &regT
	LD		r13
	RAL								;C = bit 11 (lowest bit of rd)
	LD		r12
	RAL								;A = low 4 bits of rd, C = bit 15 (top bit of rd)
	JCN		C, cop0_inval
	;we know there is no carry
	RAL								;A = (2 * rd) & 15, C = bit 3 of rd
	XCH		r1						;low nibble of target (table's low address nibble is assumed to be 0)
	LDM		0
	ADD		r0						;high nibble of target = table's high nibble + carry from doubling
	XCH		r0
	JIN		r0

;MTC0: dispatch to the per-register write handler through cop0_write_tab.
;the write handlers are entered with r2:r3 = &regT and DCL = 0 (see cop0_write_tab).
cop0_mtc0:
	JMS		cpuGetRegPtrT_r2r3				;into r2:r3
	FIM		r0, cop0_write_tab
	LDM		0
	DCL
	JUN		cop0_reg_access

;TLBR: copy the TLB entry selected by the Index register into ENTRYHI (16 nibbles, so presumably ENTRYHI and
;ENTRYLO back to back - confirm the DCL1_* layout). assumes DCL = 1 on entry. exits via next_instr.
;
;a TLB entry is addressed as {high nibble = entry index, low nibble = nibble offset inside the entry} in the
;TLB_ENTRIES_DCL bank: cpuPrvTlbWrite puts the index into r2 (high half of r2:r3) and the hash code does
;"SRC r4"/"SRC r6" with an entry index in the high half. the source pointer here is r4:r5, so the index
;belongs in r4 and r5 must be 0. (the previous code loaded them the other way around, which always read
;entry 0 starting at nibble "index".)
cop0_tlbr:
	FIM		r0, DCL1_INDEX_IDX
	SRC		r0
	RDM
	XCH		r4						;r4 = entry index (selects the entry)
	LDM		0
	XCH		r5						;r5 = 0: start at the entry's first nibble
	FIM		r2, DCL1_ENTRYHI
	FIM		r6, (TLB_ENTRIES_DCL SHL 4) + 1	;r6 = DCL to read from, r7 = DCL to write to (same convention as the other copy call sites in this file)
	FIM		r0, 00H		;copy 16 nibbles
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL
	JUN		next_instr

;TLBP: look ENTRYHI up in the TLB; record the matching entry in Index, or set Index.P when nothing matches.
;assumes DCL = 1 on entry. exits via next_instr with DCL = 1.
cop0_tlbp:
	FIM		r4, DCL1_ENTRYHI
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore
	JMS		cpuPrvTlbHashSearch					;TMP32B = addr, matched entry idx in r0, C = success. on success, DCL *WILL* be set to TLB_ENTRIES_DCL. clobbers r0, r1, r2, r3, r4, r6, r7, DCL
	LDM		1
	DCL
	FIM		r2, DCL1_INDEX_IDX
	SRC		r2
	LD		r0
	WRM									;DCL1_INDEX_IDX = r0 (written even on a miss; r0's value on a miss is whatever the search left there)
	CMC									;C = "not found" (none of the instructions since the search touched carry)
	LDM		0
	RAR									;A = 8 on a miss, 0 on a hit
	WR0			;DCL1_INDEX_P
	JUN		next_instr

;COP3 instruction: treated as a hypercall when in kernel mode, as an undefined instruction otherwise
instr_cop3:
	;we assume all cop3 instrs are hypercalls as no others should exist
	JMS		cpuPrvIsInKernelMode		;C set = kernel mode
	JCN		C, do_hyper_call
	JUN		instr_undef
do_hyper_call:
	JUN		cpuExtHypercall		;responsible for jumping to next_instr at end and accessing regs as it needs

;SWR: store the low-order part of regT into an unaligned word.
;implemented as read-modify-write: the containing word is loaded into TMP32C, patched from the register,
;then written back by inst_swX_common.
instr_swr:
		;copy from: start of reg
		;len_in_nibbles = 8 - 2 * (adr & 3)
		;copyTo: mem + 2 * (adr & 3)
	LDM		MEM_RMW
	JMS		instr_XwX_common_start	;r12:r13 is regT, r14 = low bits of addr (instr_Xwr_calc uses it as adr & 3), TMP32C is loaded word
	JMS		prvSwapr2r3_with_r12r13	;r2:r3 now points to regT
	JMS		instr_Xwr_calc			;r0 = 16 - len, r4:r5 = TMP32C + 2 * (adr & 3)
	JUN		inst_swX_common			;swaps the pointers so that the register becomes the source

;TLBWI / TLBWR: write ENTRYHI/ENTRYLO into the TLB entry selected by Index (TLBWI) or Random (TLBWR).
;both load the entry number into A and fall through into cpuPrvTlbWrite. assumes DCL = 1.
cop0_tlbwi:
	FIM		r2, DCL1_INDEX_IDX
	JUN		tlb_write_at

cop0_tlbwr:
	FIM		r2, DCL1_RANDOM
tlb_write_at:
	SRC		r2
	RDM								;A = entry number to overwrite

;replace TLB entry number A with the current ENTRYHI/ENTRYLO contents. three steps, all fallthrough:
;	1. cpuPrvTlbHashRemove      - unlink the old entry from its hash chain
;	2. cpuPrvTlbHashRemove_done - copy ENTRYHI/ENTRYLO into the entry
;	3. cpuPrvTlbHashAdd         - link the new entry at the head of its bucket, then jump to next_instr
;
;per-entry bookkeeping lives in the entry's four status characters (as used by the code below):
;	status0 = nextIdx, status1 = haveNextIdx (zero / nonzero)
;	status2 = prevIdx, or the low nibble of the bucket number when the entry is the chain head
;	status3 = bit 1: havePrevIdx, bit 0: high bit of the bucket number (chain head only)
;bucket numbers are 5 bits wide and index DLC1_HASH_PRESENT[] / DLC1_HASH_HEADS[] in the DCL = 1 bank.
cpuPrvTlbWrite:							;A = index. ENTRYHI/ENTRYLO have values
	XCH		r2							;r2:r3 points somewhere inside the TLB entry

cpuPrvTlbHashRemove:
	LDM		TLB_ENTRIES_DCL
	DCL
	SRC		r2
	RD3
	CLC
	RAR									;A = havePrevIdx flag, C = high bit of bucket number
	JCN		NZ, cpuPrvTlbHashRemove_havePrev

cpuPrvTlbHashRemove_noPrev:				;DCL = TLB_ENTRIES_DCL on entry
	TCC
	XCH		r0
	RD2
	XCH		r1							;r0:r1 is now "idx" aka "the bucket we are head of"

	RD1
	JCN		Z, cpuPrvTlbHashRemove_noPrev_noNext


cpuPrvTlbHashRemove_noPrev_yesNext:		;DCL = TLB_ENTRIES_DCL on entry

	RD0
	XCH		r4							;r4 = TLB[entryIDX].nextIdx
	
	;write bucket number into next entry's "prev" (this also clears its havePrevIdx: it becomes the chain head)
	SRC		r4
	LD		r1
	WR2
	LD		r0
	WR3
	
	;write next entry's index into the bucket heads array
	LDM		1
	DCL
	CLC
	LDM		DLC1_HASH_HEADS SHR 4
	ADD		r0
	XCH		r0							;r0:r1 = &DLC1_HASH_HEADS[idx]
	SRC		r0
	LD		r4
	WRM
	JUN		cpuPrvTlbHashRemove_done
	
	
cpuPrvTlbHashRemove_noPrev_noNext:		;DCL = TLB_ENTRIES_DCL on entry
	
	;we were the only entry in the bucket, so the bucket becomes empty
	;DLC1_HASH_PRESENT[idx] = 0
	LDM		1
	DCL
	CLC
	LDM		DLC1_HASH_PRESENT SHR 4
	ADD		r0
	XCH		r0							;r0:r1 = &DLC1_HASH_PRESENT[idx]
	SRC		r0
	LDM		0
	WRM
	JUN		cpuPrvTlbHashRemove_done


cpuPrvTlbHashRemove_havePrev:			;DCL = TLB_ENTRIES_DCL on entry
	RD2
	XCH		r0							;r0 = index of prev entry

	RD1
	JCN		Z, cpuPrvTlbHashRemove_havePrev_noNext

cpuPrvTlbHashRemove_havePrev_haveNext:	;DCL = TLB_ENTRIES_DCL on entry, r0 = prevIdx
	RD0		;read next idx
	XCH		r4							;r4 = index of next entry

	SRC		r0							;prev->next = next
	LD		r4
	WR0

	SRC		r4							;next->prev = prev (its havePrevIdx flag is already set and stays set)
	LD		r0
	WR2

	JUN		cpuPrvTlbHashRemove_done

cpuPrvTlbHashRemove_havePrev_noNext:	;DCL = TLB_ENTRIES_DCL on entry, r0 = prevIdx
	SRC		r0							;prev->haveNextIdx = 0
	LDM		0
	WR1
	;fallthrough


;second step of a TLB write: the old entry is unlinked, now overwrite its contents.
;copies 15 nibbles from ENTRYHI + 1 (DCL = 1) to nibble 1 onward of entry r2 (DCL = TLB_ENTRIES_DCL),
;then clears the low two bits of the first copied nibble. falls through into cpuPrvTlbHashAdd.
cpuPrvTlbHashRemove_done:				;DCL varies on entry, r2 = entryIDX
	;write entry by copying from ENTRYHI/ENTRYLO to our index. skip the first nibble of entryhi, we do not care
	LDM		1
	XCH		r3							;r2:r3 = nibble 1 of the entry
	FIM		r4, DCL1_ENTRYHI + 1
	FIM		r6, (1 SHL 4) + TLB_ENTRIES_DCL
	FIM		r0, 11H		;copy 15 nibbles
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL
	
	;DCL is TLB_ENTRIES_DCL, r2:r3 = entry + 1, this is a good time to clear the bottom 2 bits to make ASID comparisons simpler
	SRC		r2
	RDM
	RAR
	RAR									;the two low bits are now rotated out of bits 3..2
	CLC
	RAL
	CLC
	RAL									;A = original nibble with bits 1..0 forced to zero
	WRM

	;when we insert it, it'll have no prev but might have next
	;third step of a TLB write: insert entry r2 at the head of its hash bucket, then jump to next_instr.
	;bucket number = (entry nibble 3) + (entry nibble 6), a 5-bit value (carry is the top bit).
	;in: r2 = entryIDX, DCL = TLB_ENTRIES_DCL. clobbers r0, r1, r3, r6, r7, A, C.
	;when we insert it, it'll have no prev but might have next
cpuPrvTlbHashAdd:
	
	;hash it
	LDM		3
	XCH		r3
	SRC		r2
	RDM									;A = entry nibble 3
	INC		r3
	INC		r3
	INC		r3
	SRC		r2							;now pointing at entry nibble 6
	CLC
	ADM									;C:A = "idx" == our bucket idx
	XCH		r1
	LDM		DLC1_HASH_PRESENT SHR 5
	RAL									;shift the carry (top bit of idx) into the high address nibble
	XCH		r0							;r0:r1 = &DLC1_HASH_PRESENT[idx]

	LDM		1
	DCL

	SRC		r0							;r7 = hadNext = DLC1_HASH_PRESENT[idx]; DLC1_HASH_PRESENT[idx] = 1
	RDM
	XCH		r7
	LDM		1
	WRM

	INC		r0							;r0:r1 = &DLC1_HASH_HEADS[idx]  (the heads array is reached by adding 2 to the high nibble)
	INC		r0

	SRC		r0							;r6 = nextEntryIDX = DLC1_HASH_HEADS[idx]; DLC1_HASH_HEADS[idx] = entryIDX
	RDM
	XCH		r6
	LD		r2
	WRM

	LDM		TLB_ENTRIES_DCL
	DCL

	SRC		r2							;TLB[entryIDX].prevIdx = idx, TLB[entryIDX].havePrevIdx = 0
	LD		r1
	WR2
	LD		r0
	RAR									;C = low bit of r0 == top bit of idx
	TCC
	WR3									;status3 = top bit of idx only, so the havePrevIdx bit is 0

	LD		r7							;TLB[entryIDX].haveNextIdx = hadNext
	WR1
	JCN		Z, cpuPrvTlbHashAdd_nextHandled

cpuPrvTlbHashAdd_haveChain:				;we have a next. DCL = TLB_ENTRIES_DCL, r2 = entryIDX, r6 = nextEntryIDX

	LD		r6							;TLB[entryIDX].nextIdx = nextEntryIDX; 
	WR0

	SRC		r6							;TLB[nextEntryIDX].prevIdx = entryIDX; TLB[nextEntryIDX].havePrevIdx = 1;
	LD		r2
	WR2
	LDM		2
	WR3

cpuPrvTlbHashAdd_nextHandled:		;DCL = TLB_ENTRIES_DCL
	JUN		next_instr


;LWL: merge the leading bytes of an unaligned word into the upper part of regT.
;the containing word is first loaded into TMP32C, then the relevant nibbles are copied into the register.
;inst_lwX_common is the shared tail of LWL and LWR: r4:r5 = source in the DCL = 1 bank, r2:r3 = destination
;in the DCL = 0 (register) bank, r0 = 16 - number of nibbles to copy.
instr_lwl:
		;copy_from start of mem (low end)
		;copy_to: reg + 6 - 2 * (Addr & 3)
		;len_in_nibbles = 2 + 2 * (Addr & 3)
		;r0 for cpuPrvCopyX_r4r5_to_r2r3_with_DCL = 16 - len_in_nibbles = 16 - (2 + 2 * (Addr & 3)) = 14 - 2 * (Addr & 3)
	JMS		instr_XwX_common_start_read	;r12:r13 is regT, r14 = low bits of addr (used as Addr & 3), TMP32C is loaded word
	JMS		prvSwapr2r3_with_r12r13		;r2:r3 now points to regT
	JMS		dst_zero_check
	JMS		instr_Xwl_calc				;sets r0 and adjusts r2:r3
	FIM		r4, DCL1_TMP32C
inst_lwX_common:
	FIM		r6, 10H						;copy from DCL 1 to DCL 0
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL	;we did not set r1, so pointers will be corrupted
	JUN		next_instr

;LWR: merge the trailing bytes of an unaligned word into the lower part of regT.
;same structure as LWL; only the offset/length math (instr_Xwr_calc) differs.
instr_lwr:
		;copy to: start of reg
		;len_in_nibbles = 8 - 2 * (Addr & 3)
		;copyFrom: mem  + 2 * (Addr & 3)
		;r0 for cpuPrvCopyX_r4r5_to_r2r3_with_DCL = 16 - len_in_nibbles = 8 + 2 * (Addr & 3)

	JMS		instr_XwX_common_start_read	;r12:r13 is regT, r14 = low bits of addr (used as Addr & 3), TMP32C is loaded word
	JMS		prvSwapr2r3_with_r12r13		;r2:r3 now points to regT
	JMS		dst_zero_check
	JMS		instr_Xwr_calc				;r0 = 16 - len, r4:r5 = TMP32C + 2 * (Addr & 3)
	JUN		inst_lwX_common

;SWL: store the upper part of regT into the leading bytes of an unaligned word.
;implemented as read-modify-write: the containing word is loaded into TMP32C, patched, then written back.
;inst_swX_common is the shared tail of SWL and SWR. on entry r2:r3 = register-side pointer and
;r4:r5 = TMP32C-side pointer; they are swapped so that the register is the source of the copy.
instr_swl:
		;copy to start of mem
		;len_in_nibbles = 2 + 2 * (addr & 3)
		;copy from: reg + 2 * (addr & 3)
	LDM		MEM_RMW
	JMS		instr_XwX_common_start	;r12:r13 is regT, r14 = low bits of addr (used as addr & 3), TMP32C is loaded word
	JMS		prvSwapr2r3_with_r12r13	;r2:r3 now points to regT
	JMS		instr_Xwl_calc			;adjusts r2:r3
	FIM		r4, DCL1_TMP32C
inst_swX_common:
	JMS		prvSwapr2r3_with_r4r5
	FIM		r6, 01H					;copy from DCL 0 to DCL 1
	JMS		cpuPrvCopyX_r4r5_to_r2r3_with_DCL	;we did not set r1, so pointers will be corrupted
	;now write memory back
	LDM		0DH						;word-sized access
	XCH		r9
	LDM		0						;access type 0 (the memory accessors in this file treat r11 == 0 as a write)
	XCH		r11
	JMS		mem_access_with_pa		;r9 = 15 - lg2(access_size_in_nibbles), r11 = accessType(MEM_*)
	JUN		next_instr
	




;in-page trampoline so that the JCN below can reach mem_signal_bus_error
j_mem_signal_bus_error_1
	JUN		mem_signal_bus_error

;memory_accessor: ;r9 = 15 - lg2(access_size_in_nibbles), r11 = accessType(MEM_*), DCL1_TMP32B = addr, DCL1_TMP32C = data, on error jump to mem_signal_bus_error
					;r9 MUST be left unmolested!!!!!

;accessor for the bus-error-address reporting register.
;only word accesses are accepted (anything else raises a bus error). writes are ignored,
;reads return the 8 nibbles stored at DCL1_BUS_ERROR_ADDR.
memAccessDecBusErrorReporter:			;memory_accessor

	LDM		0DH		;verify word access
	CLC
	SUB		r9
	JCN		NZ, j_mem_signal_bus_error_1

	;assume addr zero - no other is sane to read
	;assume no RMW or instr fetches
	;ignore writes
	LD		r11
	JCN		NZ, memAccessDecBusErrorReporter_read
	BBL		0

memAccessDecBusErrorReporter_read
	FIM		r4, DCL1_BUS_ERROR_ADDR
	FIM		r2, DCL1_TMP32C
	JUN		cpuPrvCopy8_r4r5_to_r2r3_currDCL_noPtrRestore	;tail call: the copy routine returns to our caller

;accessor for the DS1287 real-time clock (memory_accessor interface: r11 = access type, data in DCL1_TMP32C).
;writes are ignored. any read returns a zero byte and acknowledges a pending RTC interrupt
;by clearing CAUSE bit 13 (bit 1 of the nibble at DCL1_CAUSE + 3).
memAccessDS1287:
	;assumes byte access, no RMW access, no instr fetches
	;cause[bit 13] says if RTC irq is signalling
	LD		r11
	JCN		Z, j_just_ret
	;read: provide the "read" zero byte
	LDM		0
	FIM		r4, DCL1_TMP32C
	SRC		r4
	INC		r5
	WRM							;data nibble 0 = 0
	SRC		r4
	WRM							;data nibble 1 = 0
	;clear irq if set
	FIM		r4, DCL1_CAUSE + 3
	SRC		r4
	RDM
	RAR
	RAR							;C = bit 1 of the nibble == CAUSE bit 13
	JCN		NC, j_just_ret
	;irq was set - clear it
	CLC
	RAL
	RAL							;rotate back: same nibble with bit 1 now zero
	WRM
	JMS		cpuPrvIrqsRecalc	;we changed interrupt state

j_just_ret:
	BBL		0


;AND rd, rs, rt: regD = regS & regT. skipped when the destination is the zero register (handled by the
;"_skip_if_zero" helper). the actual work is done on ROM page 1, reached through the veneer.
instr_and:
	JMS		cpuGetRegPtrD_r2r3_skip_if_zero	;into r2:r3
	JMS		cpuGetRegPtrT_r4r5				;into r4:r5
	JMS		cpuGetRegPtrS_r6r7				;into r6:r7
	FIM		r12, 000H						;DCL=0	(veneer contract: r12 must be < 4, r13 = DCL to operate on)
	JMS		cpuPrvThreeOperandAnd_veneer
	JUN		next_instr

;ADDI rt, rs, imm: add with signed-overflow detection.
;operands and result live in the TMP32 scratch words: TMP32B = TMP32A + TMP32C.
;NOTE(review): presumably instr_simm16_1reg_common fills TMP32A/TMP32C with regS and the sign-extended
;immediate, and add_common_ovf_handling does not return when the add overflowed - confirm in those helpers.
instr_addi:
	JMS		instr_simm16_1reg_common
	FIM		r4, DCL1_TMP32A
	FIM		r6, DCL1_TMP32C
	FIM		r2, DCL1_TMP32B
	JMS		cpuPrvThreeOperandAdd
	JMS		add_common_ovf_handling
	JUN		instr_with_imm_get_dst_and_final_copy


;extract the 5-bit "A" field (instruction bits 10..6) as a plain number.
;in:  r13 = instruction bits 11..8, r14 = instruction bits 7..4
;out: r6 = top bit of the field (0 or 1), r7 = low four bits of the field. A = 0 (BBL 0), C clobbered.
cpuGetRegNumA_r6r7:	;into r6:r7, A is at bit 6
	LD		r13
	RAR								;C = bit 8
	XCH		r6
	LD		r14
	RAR								;A = bits 8,7,6,5
	XCH		r6
	RAR								;C = bit 9
	XCH		r6
	RAR								;A = bits 9,8,7,6
	XCH		r7
	LD		r6
	RAR								;C = bit 10
	TCC
	XCH		r6
	BBL		0


;accessor for the bus CSR (memory_accessor interface): writes are ignored, reads return zero.
memAccessDecBusCSR:
	LD		r11
	JCN		NZ, mem_accessor_zero_result
	BBL		0

;shared "read returns zero" tail: zero all 8 nibbles of the data word and return to the accessor's caller
mem_accessor_zero_result:
	FIM		r4, DCL1_TMP32C
	JUN		cpuPrvZeroFill8	;it is always safe to over-write the entire word


;in-page trampoline so that the JCN below can reach mem_signal_bus_error
j_mem_signal_bus_error_2
	JUN		mem_signal_bus_error

;accessor for the DZ11 serial controller (memory_accessor interface: r9 = size code, r11 = access type with
;0 == write, DCL1_TMP32B = address, DCL1_TMP32C = data). only halfword accesses are accepted.
;address bits 4 and 3 select the register:
;	bit4 = 0, bit3 = 0 -> CSR          (dz11_access_csr)
;	bit4 = 0, bit3 = 1 -> RBUF / LPR   (dz11_access_rbuf_lpr)
;	bit4 = 1, bit3 = 0 -> TCR          (dz11_access_tcr)
;	bit4 = 1, bit3 = 1 -> MSR / TDR    (dz11_access_msr_tdr)
;every handler is entered with r4:r5 = DCL1_TMP32C, r0:r1 = DCL1_DZ11_FLAGS, C = address bit 4, A = address bit 3.
memAccessDZ11:
	;assumes no RMW access, no instr fetches
	LDM		0EH		;verify halfword access
	CLC
	SUB		r9
	JCN		NZ, j_mem_signal_bus_error_2

	FIM		r0, DCL1_TMP32B
	SRC		r0
	INC		r1
	RDM				;lowest address nibble
	RAL
	TCC				;A = address bit 3
	SRC		r0		;select the next address nibble (r1 was incremented above)
	XCH		r1	;r1 now has address bit 3 (top bit of the lowest address nibble)
	RDM
	RAR				;C = address bit 4
	LD		r1
	FIM		r4, DCL1_TMP32C
	FIM		r0, DCL1_DZ11_FLAGS
	JCN		C, dz11_access_hi_regs
dz11_lo_regs:
	JCN		Z, dz11_access_csr

;DZ11 register at offset 8: reads are RBUF (dz11_read_rbuf), writes are LPR (line parameters).
;entered from memAccessDZ11 with r4:r5 = DCL1_TMP32C, r0:r1 = DCL1_DZ11_FLAGS, r11 = access type and C = 0
;(this path is only taken when "JCN C, dz11_access_hi_regs" fell through, and nothing since then touches carry).
dz11_access_rbuf_lpr:
	LD		r11
	JCN		NZ, dz11_read_rbuf

;LPR write: only the line whose number has both low bits clear (line 0) is emulated. for that line, record
;the receiver-enable bit and fall into dz11_recalc. writes for any other line are ignored.
;the written line number must actually be read with RDM: previously A was still 0 from the LD r11 above,
;so the "not our line" branch could never be taken and an LPR write for any line changed our RX enable.
;RDM takes the place of a CLC that was redundant because carry is already 0 here, so the code size is unchanged.
dz11_write_lpr:
	SRC		r4
	RDM							;A = lowest nibble of the written value (line number is in its low bits)
	RAL							;carry-in is 0 (see above)
	CLC
	RAL							;A = (val & 3) << 2
	JCN		NZ, j_dz11_ret		;not our line

	;rxEna = val & LPR_RXON
	FIM		r4, DCL1_TMP32C + 3
	SRC		r4
	RDM
	RAR							;C = bit 12 of the written value
	TCC
	SRC		r0
	WR3							;DCL1_DZ11_BITS_A
	;fallthrough

;recompute the DZ11 interrupt line and return to the accessor's caller (BBL 0).
;the interrupt is asserted when (TCR bit 0 set AND TIE set) OR (RIE set AND a received byte is pending).
;the line is bit 0 of the nibble at DCL1_CAUSE + 3. cpuPrvIrqsRecalc is always called afterwards.
;DCL1_DZ11_BITS_B layout as used here: bit 3 = RIE, bit 0 = TIE.
dz11_recalc:					;assumes r0:r1 == DCL1_DZ11_FLAGS
	FIM		r4, DCL1_CAUSE + 3
	SRC		r0
	RD0							;DCL1_DZ11_TCR
	RAR							;C = TCR bit 0
	JCN		NC, dz11_tcr_0bit_clr

dz11_tcr_0bit_set:
	RD1							;DCL1_DZ11_BITS_B
	RAR							;C = TIE
	JCN		C, dz11_set_irq			;TCR[0] and TIE set

dz11_tcr_0bit_clr:
	RD1							;DCL1_DZ11_BITS_B
	RAL							;C = RIE
	JCN		NC, dz11_rie_clr

dz11_rie_set:
	RD2							;DCL1_DZ11_HAVE_RX
	JCN		NZ, dz11_set_irq		;RIE is set and rx buf is not empty

dz11_rie_clr:
dz11_clr_irq:
	SRC		r4
	RDM
	RAR							;drop the old irq bit...
	CLC							;...and rotate a zero back in below

dz11_wr_irq:
	RAL
	WRM
	JMS		cpuPrvIrqsRecalc		;we changed interrupt state
	BBL		0

dz11_set_irq:
	SRC		r4
	RDM
	RAR							;drop the old irq bit...
	STC							;...and rotate a one back in
	JUN		dz11_wr_irq

;RBUF read: return the buffered received byte in the two low data nibbles, zero in the third nibble and the
;DCL1_DZ11_HAVE_RX flag in the fourth (top) nibble. the flag is cleared (the byte is consumed) and the
;interrupt state is recomputed. entered with r4:r5 = DCL1_TMP32C.
dz11_read_rbuf:
	FIM		r2, DCL1_DZ11_RX_BUFPTR
	SRC		r2
	RD0									;DCL1_DZ11_RX_BUF_HI
	XCH		r0
	RD1									;DCL1_DZ11_RX_BUF_LO
	SRC		r4
	INC		r5
	WRM									;data.loNibble
	SRC		r4
	INC		r5
	LD		r0
	WRM									;data.hiNibble

	SRC		r4
	INC		r5
	LDM		0							;third nibble is zero
	WRM

	FIM		r0, DCL1_DZ11_FLAGS
	SRC		r0
	RD2							;DCL1_DZ11_HAVE_RX
	XCH		r2
	LDM		0
	WR2							;DCL1_DZ11_HAVE_RX
	LD		r2
	SRC		r4
	WRM							;fourth nibble = old HAVE_RX value
	JUN		dz11_recalc

;DZ11 registers with address bit 4 set. A = address bit 3: zero -> TCR, nonzero -> MSR (read) / TDR (write).
dz11_access_hi_regs:
	JCN		Z, dz11_access_tcr

dz11_access_msr_tdr:
	LD		r11
	JCN		NZ, mem_accessor_zero_result	;read MSR is a zero

;TDR write: send the low byte of the written value to the console.
;tail-calls the putchar veneer with r0:r1 = char (r0 = high nibble, r1 = low nibble).
;NOTE(review): the veneer's header says r12 must be < 4 for it to work, and r12 is not set up here -
;confirm that every path into this accessor guarantees that.
dz11_write_tdr:
	SRC		r4
	INC		r5
	RDM							;data nibble 0 (SRC was issued before the INC)
	XCH		r1
	SRC		r4
	RDM							;data nibble 1
	XCH		r0
	JUN		cpuPrvPutchar_veneer

j_dz11_ret:						;shared "do nothing and return" exit
	BBL		0

;TCR access. only the lowest nibble of TCR is stored (in DCL1_DZ11_TCR).
;entered with r4:r5 = DCL1_TMP32C, r0:r1 = DCL1_DZ11_FLAGS, r11 = access type.
dz11_access_tcr:
	LD		r11
	JCN		NZ, dz11_read_tcr

dz11_write_tcr:
	SRC		r4
	RDM							;lowest nibble of the written value
	SRC		r0
	WR0							;DCL1_DZ11_TCR
	JUN		dz11_recalc			;TCR bit 0 takes part in the irq condition

dz11_read_tcr:
	SRC		r0
	RD0							;DCL1_DZ11_TCR
	SRC		r4
	INC		r5
	WRM							;data nibble 0 = stored TCR nibble
	LDM		16 - 3
	XCH		r0					;r0 = 16 - 3: zero-fill the remaining 3 nibbles of the halfword, starting at r4:r5
	JUN		cpuPrvZeroFillEx

;CSR access. entered with r4:r5 = DCL1_TMP32C, r0:r1 = DCL1_DZ11_FLAGS, r11 = access type.
;CSR bits handled (bit numbers within the 16-bit register):
;	bit 4 = CLR (write: reset), bit 5 = MSE (always reads as 1), bit 6 = RIE, bit 7 = RDONE (read only),
;	bit 14 = TIE, bit 15 = TRDY (read only, mirrors TCR bit 0)
dz11_access_csr:
	LD		r11
	JCN		NZ, dz11_read_csr

dz11_write_csr:
	INC		r5
	SRC		r4
	RDM							;data nibble 1 (bits 7..4)
	RAR							;C = bit 4 (CLR)
	JCN		NC, dz11_write_csr_reset_handled

dz11_reset:
	SRC		r0
	LDM		0
	WR0							;DCL1_DZ11_TCR
	WR2							;DCL1_DZ11_HAVE_RX
	WR3							;DCL1_DZ11_BITS_A

	;DCL1_DZ11_BITS_B is about to be written so do not bother clearing it

dz11_write_csr_reset_handled:

	SRC		r4
	RDM		;A = {?, RIE, ?, ?}
	XCH		r6
	INC		r5
	INC		r5
	SRC		r4					;data nibble 3 (bits 15..12)
	RDM		;A = {?, TIE, ?, ?}
	RAL
	RAL		;C = TIE
	LD		r6
	RAL		;A = {RIE, ?, ?, TIE}
	SRC		r0
	WR1							;DCL1_DZ11_BITS_B
	JUN		dz11_recalc

dz11_read_csr:
	SRC		r4
	INC		r5
	LDM		0
	WRM				;bottom nibble has nothing

	SRC		r0
	RD1							;DCL1_DZ11_BITS_B
	RAL				;C = "RIE"
	LDM		8
	RAR				;A = {RIE,MSE,CLR==0,0}
	XCH		r6
	RD2							;DCL1_DZ11_HAVE_RX
	RAL				;C = have byte
	LD		r6
	RAR				;A = {RDONE,RIE,MSE,CLR}
	SRC		r4
	INC		r5
	WRM				;second nibble

	LDM		0		;third nibble also has nothing
	SRC		r4
	INC		r5
	WRM

	SRC		r0
	RD1							;DCL1_DZ11_BITS_B
	RAR				;C = "TIE"
	LDM		0
	RAR				;A = {TIE, 0, 0, 0}
	XCH		r6
	RD0							;DCL1_DZ11_TCR
	RAR				;C = TCR[bit 0] (used as TRDY)
	LD		r6
	RAR				;A = {TRDY, TIE, 0, 0}
	SRC		r4
	WRM				;fourth nibble
	BBL		0




;turn the 5-bit "S" field (instruction bits 25..21) into a nibble pointer.
;in:  r9 = instruction bits 27..24, r10 = instruction bits 23..20
;out: r4:r5 = S * 8 (r4 = S >> 1, r5 = 8 when S is odd, else 0). A = 0 (BBL 0), C clobbered.
;     presumably each emulated register occupies 8 nibbles, so this is the address of regS.
cpuGetRegPtrS_r4r5:	;into r4:r5, S is at bit 21
	LD		r10
	RAL								;C = bit 23
	XCH		r5
	LD		r9
	RAL								;A = bits 26,25,24,23
	XCH		r5
	RAL								;C = bit 22
	XCH		r5
	RAL								;A = bits 25,24,23,22 == S >> 1
	XCH		r4
	LD		r5
	RAL								;C = bit 21 == low bit of S
	LDM		0
	RAR								;A = 8 if S is odd, else 0
	XCH		r5
	BBL		0


	;filler: 21 bytes of currently unused ROM, available for new code.
	;NOTE(review): adding or removing bytes here moves everything that follows; the code below uses
	;in-page JCN/ISZ jumps, so re-check page boundaries after any change.
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP
	NOP		;space here



;accessor for the SPI PSRAM that backs emulated RAM (memory_accessor interface).
;r11 == 2 (instruction fetch) takes the dedicated fast path in code_ram_access. everything else goes through
;the generic path below: select a chip, send a command byte (0x03 = read, 0x02 = write), send the 24-bit
;address high nibble first, then transfer (16 - r8) data nibbles.
;registers used by the generic path: r0, r1, r3, r4, r5, r6, r8, r10 (plus whatever spiRamSendNibble uses).
memAccessSpiRam:
		;numNibbles = 1 << (16 - r9), r11 = MEM_* (operation), addr is in DCL1_TMP32B, data goes to DCL1_TMP32C.
		;	both of those values are in the 4002 that has the PSRAM SPI TX port
		; for ram access code for the emu: cannot clobber: r9, r12, r13, DCL1_TMP32B

	LD		r11
	DAC
	DAC
	JCN		Z, code_ram_access			;r11 == 2

normal_ram_access:
	CLC
	LDM		0DH
	SUB		r9
	JCN		NZ, memAccessSpiRam_notWord

memAccessSpiRam_word:
	LDM		16 - 8

memAccessSpiRam_accessSizeHandled:		;A = 16 - num nibbles
	XCH		r8							;r8 = loop counter for ISZ: counts up to 16

	;pick a chip, store proper value to r10 (this SRC also selects the proper ram chip for output port)
	FIM		r0, DCL1_TMP32B + 6
	SRC		r0
	CLC
	LDM		3
	ADM									;now has 0b011 for chip 0 and 0b101 for chip 1 (proper chip select values)
	XCH		r10

	FIM		r4, DCL1_TMP32C
	FIM		r0, DCL1_TMP32B + 5

	;select (carry is still 0), clock and data lines low. carry is still zero
	LD		r10
	RAL
	WMP

	;send command
	LDM		0							;high nibble of the command byte
	JMS		spiRamSendNibble

	LD		r11
	JCN		Z, memAccessSpiRam_getCmdWrite

memAccessSpiRam_getCmdRead:
	LDM		3							;low nibble of the command byte: 3 = read
memAccessSpiRam_sendCmdLoNibble:
	JMS		spiRamSendNibble

	;send addr: six nibbles, most significant first
	FIM		r0, DCL1_TMP32B + 5
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 4
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 3
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 2
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 1
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 0
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	LD		r11
	JCN		Z, memAccessSpiRam_doWrite	;r11 == 0 is a write, anything else falls into the read loop

;read phase of memAccessSpiRam: clock in (16 - r8) nibbles and store them at r4:r5 onward.
;each nibble is assembled from four one-bit samples: RDR reads the input port and RAR moves its bit 0 into
;carry, which is then shifted into r6 from the right, so the first sampled bit ends up as the nibble's top bit.
;r10 and r3 hold the two output-port values written between samples (see the per-line comments).
memAccessSpiRam_doRead:

	;adjust r10
	LD		r10
	CLC
	RAL
	XCH		r10					;r10 = 2 * (chip select value)
	LD		r10
	DAC
	DAC
	XCH		r3					;r3 = r10 - 2

memAccessSpiRam_doRead_loop:

	SRC		r4					;selects the destination nibble for the WRM below (and the chip for WMP)
	INC		r5

	;this is spiRecvNibble_0 START
	LD		r10	;chip select low, clock low, data high
	WMP
	RDR
	XCH		r6	;first sample is stored as-is; only its bit 0 survives the three shifts below
	LD		r3	;chip select low, clock high, data high
	WMP

	LD		r10	;chip select low, clock low, data high
	WMP
	RDR
	RAR
	LD		r6
	RAL
	XCH		r6
	LD		r3	;chip select low, clock high, data high
	WMP

	LD		r10	;chip select low, clock low, data high
	WMP
	RDR
	RAR
	LD		r6
	RAL
	XCH		r6
	LD		r3	;chip select low, clock high, data high
	WMP

	LD		r10	;chip select low, clock low, data high
	WMP
	RDR
	RAR
	LD		r6
	RAL
	WRM			;store the completed nibble
	LD		r3	;chip select low, clock high, data high
	WMP
	;this is spiRecvNibble_0 STOP

	ISZ		r8, memAccessSpiRam_doRead_loop
	
	LD		r10	;lower clock while keeping nCS low
	;fallthrough
	
;shared end of a transfer (read and write paths): drop the clock, then deselect the chip
memAccessSpiRam_finish:	;A = lower clock while keeping nCS low

	WMP
	LDM		2	;raise nCS while keeping clock low
	WMP
	BBL		0

;write phase of memAccessSpiRam: send (16 - r8) nibbles starting at r4:r5, then finish the transfer
memAccessSpiRam_doWrite:
	SRC		r4
	RDM
	INC		r5
	JMS		spiRamSendNibble
	ISZ		r8, memAccessSpiRam_doWrite

	;adjust r10 as needed
	LD		r10
	CLC
	RAL							;A = 2 * (chip select value): the same "clock low, nCS low" value the read path uses

	JUN		memAccessSpiRam_finish

;low nibble of the command byte for a write (command 0x02)
memAccessSpiRam_getCmdWrite:
	LDM		2
	JUN		memAccessSpiRam_sendCmdLoNibble

memAccessSpiRam_notWord:	;we get here only for 1 and 2 byte accesses (r9 is 15 or 14 respectively), we need to produce A = 16 - num nibbles
	LD		r9
	CLC
	RAL							;15 -> 14 (2 nibbles), 14 -> 12 (4 nibbles)
	JUN		memAccessSpiRam_accessSizeHandled










		;instruction-fetch fast path of memAccessSpiRam: always reads 8 nibbles (one word).
		;clobbers r0, r1, r2, r3, r4, r5, r6, r10 and r11 (r11 no longer holds the access type afterwards).
		;addr is in DCL1_TMP32B, data goes to DCL1_TMP32C.
		;	both of those values are in the 4002 that has the PSRAM SPI TX port
		; for ram access code for the emu: cannot clobber: r9, r12, r13, DCL1_TMP32B
code_ram_access:
	FIM		r0, DCL1_TMP32B + 6
	SRC		r0
	RDM
	JCN		Z, first_ram_chip

	;per-chip constants: r10 = value used by spiRamSendNibble ("write val"), r11 = port value with clock low,
	;r2 = port value with clock high, r3 = 8 = ISZ counter start (16 - 8 nibbles to read)
second_ram_chip:
	FIM		r10, 05BH			;write val, read val with clock low, data low
	FIM		r2, 098H			;read val with clock high data low, counter for nibbles
	JUN		picked_ram_chip

first_ram_chip:
	FIM		r10, 037H			;write val, read val with clock low, data low
	FIM		r2, 058H			;read val with clock high data low, counter for nibbles

picked_ram_chip:
	FIM		r0, 0AEH			;ISZ counters: r0 = 0AH -> 6 iterations, r1 = 0EH -> 2 iterations

	;send the command byte 0x03 (read) bit by bit: six 0 bits followed by two 1 bits.
	;each bit is one write with the clock low followed by one with the clock high.
initial_zeroes:
	LD		r11
	WMP
	LD		r2
	WMP
	ISZ		r0, initial_zeroes

initial_ones:
	LD		r11
	DAC							;clearing bit 0 of the port value raises the data line
	WMP
	LD		r2
	DAC
	WMP
	ISZ		r1, initial_ones

	;send addr: six nibbles, most significant first
	FIM		r0, DCL1_TMP32B + 5
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 4
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 3
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 2
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 1
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	FIM		r0, DCL1_TMP32B + 0
	SRC		r0
	RDM
	JMS		spiRamSendNibble

	;get data
	
	FIM		r4, DCL1_TMP32C

	;clock in 8 nibbles (r3 counts from 8 up to 16) and store them at DCL1_TMP32C onward.
	;same bit-assembly scheme as the generic read loop, with r11 / r2 as the clock-low / clock-high port values.
instr_read_loop:

	SRC		r4					;selects the destination nibble for the WRM below (and the chip for WMP)
	INC		r5

	;this is spiRecvNibble_0 START
	LD		r11	;chip select low, clock low (the FIM comments above describe the data line as low for these values)
	WMP
	RDR
	XCH		r6	;first sample is stored as-is; only its bit 0 survives the three shifts below
	LD		r2	;chip select low, clock high
	WMP

	LD		r11	;chip select low, clock low
	WMP
	RDR
	RAR
	LD		r6
	RAL
	XCH		r6
	LD		r2	;chip select low, clock high
	WMP

	LD		r11	;chip select low, clock low
	WMP
	RDR
	RAR
	LD		r6
	RAL
	XCH		r6
	LD		r2	;chip select low, clock high
	WMP

	LD		r11	;chip select low, clock low
	WMP
	RDR
	RAR
	LD		r6
	RAL
	WRM			;store the completed nibble
	LD		r2	;chip select low, clock high
	WMP
	;this is spiRecvNibble_0 STOP

	ISZ		r3, instr_read_loop
	
;finish it

	LD		r11	;lower clock while keeping nCS low
	WMP
	LDM		2	;raise nCS while keeping clock low
	WMP
	BBL		0


	END
