#
#  linux_logo in RiSC assembler 0.41
#
#  by Vince Weaver
#
#  takes 872528 cycles on "nestle" running in-order RiSC-sim
#  cycle count will vary machine to machine as string lengths
#  for the system info aren't always going to be the same
#
#  Dedicated to baby Sam Clemens, born while this was being worked on
#
#  RiSC is described fully here: http://www.ece.umd.edu/~blj/RiSC/
#
#  Features
#  + 16-bit architecture
#  + 3-operand instructions
#  + 7 general purpose int registers
#  + signed 7-bit immediate
#  + zero register
#  + big-endian (though no byte instructions so you can't tell)
#  + no unaligned loads
#
#  ABI
#  + r0 is always zero
#  + r7 often used as stack pointer or zero-page pointer
#  + r6 is jump-and-link register
#  + Syscalls - "sys 0".  Number in r1, args in r2,r3,r4, etc.
#
#  Instructions
#  + add  rd,r1,r2  - rd = r1+r2
#  + addi rd,r1,IMM - rd = r1+IMM (signed 7-bit immediate)
#  + nand rd,r1,r2  - rd = ~(r1&r2)
#  + lui  rd,IMM    - rd = upper 10 bits of immediate
#  + sw   rd,r1,IMM - rd is written to address formed by r1+IMM
#  + lw   rd,r1,IMM - rd is loaded with memory found at r1+IMM
#  + beq  rd,r1,IMM - if rd==r1 then add IMM to PC
#                     ("beq 0,0,label" is used as an unconditional branch)
#  + jalr rd,r1     - rd=current pc, jump to addr in r1
#  + halt           - halt execution
#  + sys  IMM       - system call (4-bit IMMEDIATE)
#  + exc  IMM       - exception (4-bit IMMEDIATE)
#
#  Pseudo-ops
#  + movi rd,IMM     - move immediate, expands to lui/addi
#  + nop             - is an add 0,0,0
#  + lli  rd,6bitImm - expands to addi rd,rd,6bitIMM
#
#  Directives
#  + .space SIZE - allocate SIZE 16-bit zeros
#  + .fill  VAL  - place directly one 16-bit value VAL

# Syscalls
.equ SYSCALL_EXIT,     1
.equ SYSCALL_READ,     3
.equ SYSCALL_WRITE,    4
.equ SYSCALL_OPEN,     5
.equ SYSCALL_CLOSE,    6
.equ SYSCALL_SYSINFO,116
.equ SYSCALL_UNAME,  122

# File Descriptors
.equ STDIN,0
.equ STDOUT,1
.equ STDERR,2

# Optimizations
# + 1682 bytes - 2200536 cycles - Original fully working implementation
# + 1676 bytes - 2200524 cycles - pre-initialize pointers on the stack
# + 1650 bytes - 2183559 cycles - remove common movi's by stack addressing
# + 1626 bytes - 2170094 cycles - have read_byte return r3 instead of on stack
# + 1614 bytes - 2169767 cycles - add indirect jumps to avoid jalr
# + 1586 bytes - 2169577 cycles - more stack pointers, for first line of output
# + 1568 bytes - 2169751 cycles - stack pointers, for middle line of input
# + 1558 bytes - 2169746 cycles - stack pointers, last line of input
# + 1556 bytes - 2169746 cycles - optimize read_byte
# + 1524 bytes - 2139850 cycles - optimize shift_left
# + 1526 bytes -  978223 cycles - fix missing return at end of write_byte
# + 1482 bytes -  877829 cycles - optimize shift_right
# + 1480 bytes -  877706 cycles - optimize strcat
# + 1462 bytes -  877559 cycles - optimize center_and_print
# + 1456 bytes -  873014 cycles - optimize strlen
# + 1452 bytes -  873002 cycles - optimize num_to_ascii
# + 1430 bytes -  872568 cycles - optimize find_string
# + 1424 bytes -  872551 cycles - misc stack pointers
# + 1418 bytes -  872528 cycles - mask cleanup in write_byte

######################
# Initial stack layout
######################
# (offsets are relative to the "stack" label that _start loads into r7)
# -15 num_to_ascii
# -14 ascii_buffer
# -13 disk_buffer
# -12 strlen
# -11 find_string
# -10 center_and_print
#  -9 uname_info
#  -8 strcat
#  -7 write_stdout
#  -6 out_buffer
#  -5 shift_left_8
#  -4 shift_right
#  -3 text_buf
#  -2 write_byte
#  -1 read_byte
#   0 logo
#  +1 out_buffer (to be modified)
#  +2 text_ptr
#  +3 logo_byte
# -----------------
# Below this is in BSS, initialized to zero
#  +4 out_byte
#  +5 text_byte
#  +6 command
#  +7 mask
#  +8 position
#  +9 count

.include "logo.include"

#=========================
# _start: PRINT LOGO
#=========================
# Decompresses the logo data into out_buffer and prints it.
# r7 stays pointed at "stack" for the whole main program; all state is
# kept in the stack slots listed in the layout table (logo_ptr at +0,
# output_ptr at +1, r_ptr at +2, logo_byte at +3, output_byte at +4,
# r_byte at +5, command at +6, mask at +7, position at +8, count at +9).
# Each command byte carries 8 flag bits, tested low bit first:
#   bit set   -> copy one literal byte from the logo data
#   bit clear -> read a 16-bit position/length word and copy
#                (word>>10)+3 bytes out of text_buf

_start:
	movi	7,stack			# point r7 to the stack

	# logo_ptr is already loaded at stack+0
	# logo_byte is already 0 at stack+3
	# output_ptr is already at stack+1
	# output_byte is already 0 at stack+4
	# r_ptr is already N-F at stack+2
	# r_byte is already 0 at stack+5

decompression_loop:
	lw	5,7,0			# logo_ptr
	lw	4,7,3			# logo_byte
	lw	1,7,-1			# load read_byte
	jalr	6,1			# read byte
					# result in r3
	sw	3,7,6			# store command byte
	sw	5,7,0			# store logo_ptr
	sw	4,7,3			# store logo_byte
	addi	1,0,1			# reset mask to "1"
	sw	1,7,7			# store mask

test_flags:
	lw	1,7,0			# load logo_ptr
	movi	2,logo_end		# compare with end
	addi	2,2,-1			# cheat! we have an odd number of bytes
					# so hard on this arch to detect properly
	beq	1,2,done_logo_jump

not_done:
	lw	1,7,6			# load back command word
	lw	2,7,7			# load in mask
	nand	1,1,2			# r1 = command & mask
	nand	1,1,1
	add	2,2,2			# shift mask left
	sw	2,7,7			# store out mask
	beq	1,0,offset_length	# if this flag bit was not set,
					# offset_length

discrete_char:
	lw	5,7,0			# load in logo_ptr
	lw	4,7,3			# load in logo_byte
	lw	1,7,-1			# load read_byte target
	jalr	6,1			# read byte
					# result is in r3
	sw	5,7,0			# store logo_ptr
	sw	4,7,3			# store logo_byte
	addi	1,0,1			# count is 1 (write one byte)
	sw	1,7,9			# store count
	beq	0,0,store_byte

offset_length:
	lw	5,7,0			# load logo_ptr
	lw	4,7,3			# load logo_byte
	lw	1,7,-1			# load read_byte target
	jalr	6,1			# read byte
	add	2,0,3			# get result in r2
	jalr	6,1			# call read_byte again
					# (read_byte restores r1, so the
					#  target is still loaded)
					# result is in r3
	sw	5,7,0			# update logo_ptr
	sw	4,7,3			# update logo_byte

					# we want to shift r3
	lw	1,7,-5			# load shift_left_8 target
	jalr	6,1			# shift_left_8
	add	2,3,2			# merge into 16-bit value
	sw	2,7,8			# store as position

					# value to shift in r2
	lui	3,0x400			# divide by 0x400, i.e. shift right
					# by 10 bits (P_BITS)
	lw	1,7,-4			# shift_right target
	jalr	6,1			# shift right
					# result in r1
	addi	1,1,3			# add in THRESHOLD+1
	sw	1,7,9			# store as count
	beq	0,0,output_loop

	# Jump tables to handle jumps > 64 instructions
	# w/o these we'd need to jalr
done_logo_jump:
	beq	0,0,done_logo
test_flags_jump:
	beq	0,0,test_flags
decompression_jump:
	beq	0,0,decompression_loop

output_loop:
	movi	1,0x3ff			# mask
	lw	2,7,8			# load in position
	nand	2,1,2			# r2 = position & 0x3ff
	nand	2,2,2
	sw	2,7,8			# store back masked pos

					# value to shift in r2
	addi	3,0,2			# shift right by 1 (divide by 2)
	lw	1,7,-4			# load shift_right target
	jalr	6,1			# shift right
					# result in r1
	lw	5,7,-3			# point to text_buf
	add	5,5,1			# add in position
	lw	2,7,8
	addi	1,0,1			# mask for byte hi/lo
	nand	4,2,1			# r4 = position & 1
	nand	4,4,4
	lw	1,7,-1			# load target for read_byte
	jalr	6,1			# read byte
					# result is in r3
	addi	2,2,1			# increment position
	sw	2,7,8			# store out to mem

store_byte:
	lw	5,7,1			# load output_ptr
	lw	4,7,4			# load output_byte
	lw	1,7,-2			# load target for write_byte
	jalr	6,1			# write byte
	sw	4,7,4			# save output_byte
	sw	5,7,1			# save output_ptr

	lw	2,7,-3			# load text_buffer
	lw	5,7,2			# load r_ptr
	add	5,2,5			# text+r_ptr
	lw	4,7,5			# load r_byte
	lw	1,7,-2			# load target for write_byte
	jalr	6,1			# write byte
	sw	4,7,5			# store r_byte

	nand	1,2,2			# flip bits in TEXT
	addi	1,1,1			# add one (2s complement)
	add	1,5,1			# we effectively subtracted
	movi	2,511			# mask with 511 = (1024>>1)-1
	nand	1,1,2
	nand	1,1,1
	sw	1,7,2			# store back r_ptr

	lw	1,7,9			# load in count
	addi	1,1,-1			# decrement
	sw	1,7,9			# save back count
	beq	1,0,done_count
	beq	0,0,output_loop

done_count:
	lw	1,7,7			# load in mask
	lui	2,0x100			# mask == 0x100 means all 8 flag
					# bits of this command are used up
	beq	1,2,decompression_jump
	beq	0,0,test_flags_jump

done_logo:
	# because the logo is an odd number of
	# bytes, we cheat and put the ending
	# linefeed and null termination here
	# NOTE(review): only the linefeed is written below; the terminator
	# presumably comes from the zero-initialized buffer - confirm.
	lw	5,7,1			# output_ptr
	lw	4,7,4			# output_byte
	addi	3,0,10			# store a linefeed
	lw	2,7,-2			# load write_byte target
	jalr	6,2			# write byte

	lw	3,7,-6			# out_buffer
	lw	4,7,-7			# write_stdout target
	jalr	6,4			# write stdout

first_line:
	# set up pointers
	lw	5,7,-6			# set 5/4 to out_buffer/byte
	add	4,0,0

#==========================
# PRINT VERSION
#==========================
# Builds the first text line in out_buffer (r5/r4 = output pointer/byte)
# by strcat-ing pieces of the uname() result with fixed strings, then
# centers and prints it.  strcat clears r2 on return, which is why r2
# is only zeroed explicitly before the first call.

	# call uname
	movi	1,SYSCALL_UNAME		# put uname syscall in r1
	lw	2,7,-9			# point to uname info
	sys	0			# syscall

	# print OS
	add	3,2,0			# point to first entry of uname_info
					# (relies on r2 surviving the syscall)
	add	2,0,0			# start at aligned byte
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	# print Version string
	movi	3,ver_string		# point to version string
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	# print Version info
	lw	3,7,-9			# point to first entry of uname_info
	addi	3,3,32			# skip to entry at offset 32
					# NOTE(review): originally labelled
					# "2nd entry", but the host-name code
					# uses +16 for the 2nd entry - confirm
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	# print Compiled string
	movi	3,compiled_string	# point to compiled string
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	# print Compiled info
	lw	3,7,-9			# point to first entry of uname_info
	addi	3,3,48			# skip to entry at offset 48
					# (originally labelled "3rd entry";
					#  see note on offset 32 above)
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	lw	1,7,-10			# load center_and_print target
	jalr	6,1			# center and print

#===============================
# Middle-Line
#===============================
middle_line:

	#=========
	# Load /proc/cpuinfo into buffer
	#=========

	addi	1,0,SYSCALL_OPEN	# open() syscall
	movi	2,cpuinfo		# "proc/cpu.RiSC"
	add	3,0,0			# 0 = O_RDONLY
	sys	0			# syscall.  fd returned in r1
					# we should check that return >=0
					# (currently a failed open is passed
					#  straight on to read() as the fd)

	add	2,1,0			# copy the fd to the right place
	addi	1,0,SYSCALL_READ	# read() syscall
	lw	3,7,-13			# point r3 to the disk buffer
	lui	4,0x200			# assume smaller than 512 bytes
	sys	0			# syscall

	addi	1,0,SYSCALL_CLOSE	# close() the file
	sys	0			# fd already in r2

	# restore pointer to output
	lw	5,7,-6			# set 5/4 to out_buffer/byte
	add	4,0,0

	#=============
	# Number of CPUs
	#=============
number_of_cpus:
	# we cheat here and just assume 1.
# (number_of_cpus, continued) append the fixed "One" string to the line
	movi	3,one			# point to "One" string
	add	2,0,0
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	#=========
	# MHz
	#=========
print_mhz:
	movi	2,0x6672		# Find 'fr'
	lw	1,7,-11			# load find_string target
	jalr	6,1			# find_string

	movi	3,mhz			# point to "MHz" string
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	#=========
	# Chip Name
	#=========
chip_name:
	movi	2,0x7075		# Find 'pu'
	lw	1,7,-11			# load find_string target
	jalr	6,1			# find_string

	movi	3,processor		# point to "Processor " string
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	#========
	# RAM
	#========
ram:
	movi	1,SYSCALL_SYSINFO
	movi	2,sysinfo_buff
	sys	0

	lw	2,2,3			# total ram is the 4th field
	lw	1,7,-15			# point to num_to_ascii target
	jalr	6,1			# num_to_ascii

	movi	3,ram_comma		# point to "K RAM, " string
	add	2,0,0
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	#=========
	# Bogomips
	#=========
bogomips:
	movi	2,0x426f		# Find 'Bo'
	lw	1,7,-11			# load find_string target
	jalr	6,1			# find_string

	movi	3,bogo_total		# point to "Bogomips Total" string
	lw	1,7,-8			# load strcat target
	jalr	6,1			# strcat

	lw	1,7,-10			# load center_and_print target
	jalr	6,1			# center_and_print

#=================================
# Print Host Name
#=================================
last_line:
	# restore pointer to output
	lw	5,7,-6			# set 5/4 to out_buffer/byte
	add	4,0,0

	# host name from uname()
	lw	3,7,-9			# load uname_info
	addi	3,3,16			# point to 2nd entry
	add	2,0,0
	lw	1,7,-8			# point to strcat target
	jalr	6,1			# strcat

	lw	1,7,-10			# point to center_and_print
	jalr	6,1			# center_and_print

	movi	3,default_colors
	lw	1,7,-7			# point to write_stdout target
	jalr	6,1			# write_stdout

#================================
# Exit
#================================
exit:
	addi	1,0,SYSCALL_EXIT	# Put syscall in R1
	add	2,0,0			# Exit with a 0
	sys	0			# syscall
	halt				# just in case

#================================
# Read Byte
#================================
# Reads one byte from the 16-bit word at r5 and advances the r5/r4
# byte cursor.
# In:   r5 points to word holding the byte to read
#       r4 = 0 -> take the high byte (cursor stays on this word,
#                 r4 becomes 1)
#       r4 != 0 -> take the low byte (r4 becomes 0, r5 is incremented)
# Out:  r3 has result
# r1, r2 and r6 are saved and restored in a 16-word frame above r7.

read_byte:
	addi	7,7,16			# move stack
	sw	6,7,6			# back up return reg
	sw	2,7,2			# back up r2
	sw	1,7,1			# back up r1

	lui	3,0x100			# load 0x100 for later use
					# (divisor for read_even, and
					#  0x100-1 = low-byte mask)
	lw	2,5,0			# load a word from r5
	beq	4,0,read_even

read_odd:
	addi	3,3,-1			# r3=0xff, mask to get low byte
	nand	3,2,3			# r3 = word & 0xff
	nand	3,3,3
	add	4,0,0			# make even
	addi	5,5,1			# increment r5
	beq	0,0,done_read

read_even:
					# shift right by 8
					# (r2 = word, r3 = 0x100)
	movi	1,shift_right
	jalr	6,1
					# result in r1
	add	3,0,1			# move result to r3
	addi	4,0,1			# make odd

done_read:
	lw	6,7,6			# restore link reg
	lw	2,7,2			# restore r2
	lw	1,7,1			# restore r1
	addi	7,7,-16			# restore stack
	jalr	0,6			# return

#================================
# write_byte
#================================
# Stores one byte into the 16-bit word at r5 and advances the r5/r4
# byte cursor.
# In:   r5 points to word holding the byte to write
#       r4 = 0 -> replace the high byte (r4 becomes 1)
#       r4 != 0 -> replace the low byte (r4 becomes 0, r5 is incremented)
#       r3 is value to store (not masked here; callers pass a byte)
# r1, r2, r3 and r6 are saved and restored in a 16-word frame above r7.

write_byte:
	addi	7,7,16			# move stack
	sw	6,7,6			# back up return reg
	sw	3,7,3			# back up r3
	sw	2,7,2			# back up r2
	sw	1,7,1			# back up r1

	lw	1,5,0			# load existing halfword
	lui	2,0xff00		# mask base
	beq	4,0,write_even

write_odd:
	nand	1,1,2			# mask to get high byte
	nand	1,1,1
	add	1,1,3			# move in new low byte
	sw	1,5,0			# store new half-word
	add	4,0,0			# make even
	addi	5,5,1			# increment pointer
	beq	0,0,done_write

write_even:
	nand	2,2,2			# create 0x00ff mask
	nand	2,1,2
	nand	2,2,2			# erase top half

					# we want to shift r3
	movi	1,shift_left_8
	jalr	6,1			# shift_left_8

	add	2,2,3			# put new value in
	sw	2,5,0			# store new half-word
	addi	4,0,1			# make odd

done_write:
	lw	6,7,6			# restore link reg
	lw	3,7,3			# restore r3
	lw	2,7,2			# restore r2
	lw	1,7,1			# restore r1
	addi	7,7,-16			# restore stack
	jalr	0,6			# return

#================================
# shift_right
#================================
# In:   r2 = value to shift
#       r3 = amount to divide by (power of 2 only)
# Out:  r1 = return value
# r3 is consumed (it is 0 on return); r4 and r5 are saved and restored.
# Works by walking a mask bit (r3) up through r2 while a second bit
# (r5) walks up from 1; every set bit found in r2 adds r5 to the result.

shift_right:
	addi	7,7,16			# move stack
	sw	5,7,5			# back up r5
	sw	4,7,4			# back up r4

	add	1,0,0			# clear output
	addi	5,0,1			# adder

shift_loop:
	nand	4,2,3			# use mask on original value
	nand	4,4,4
	beq	4,0,no_add		# if it's zero, no need to add
	add	1,1,5			# otherwise, add in the adder value
no_add:
	add	3,3,3			# shift left the mask
	add	5,5,5			# shift left the adder
	beq	3,0,done_shift		# if we've overflowed the mask, done
	beq	0,0,shift_loop

done_shift:
	lw	5,7,5			# restore r5
	lw	4,7,4			# restore r4
	addi	7,7,-16			# restore stack
	jalr	0,6			# return

#================================
# shift_left_8
#================================
# r3 = value to shift/result
# r1 trashed
# Doubles r3 eight times; does not touch r7 or r6.

shift_left_8:
	addi	1,0,8			# always shift by 8
left_loop:
	add	3,3,3			# shift left by 1
	addi	1,1,-1			# decrement count
	beq	1,0,left_done
	beq	0,0,left_loop
left_done:
	jalr	0,6

#============================
# strcat
#============================
# Copies bytes from the input cursor to the output cursor until a zero
# byte has been copied.
# input is 3/2   (r3 = pointer, r2 = odd/even byte flag)
# output is 5/4  (r5 = pointer, r4 = odd/even byte flag)
# On return 5/4 point AT the copied zero byte, so the next strcat
# overwrites it.
# r2 is cleared
# r1 is trashed
# The -17/-18 offsets are the read_byte/write_byte stack slots (-1/-2)
# seen from r7+16.

strcat:
	addi	7,7,16			# move stack
	sw	6,7,6			# back up r6

strcat_loop:
	sw	5,7,5			# back up r5
	sw	4,7,4			# back up r4

	add	5,3,0			# move input(r3) into r5
	add	4,2,0			# move input_byte(r2) into r4
	lw	1,7,-17			# point to read_byte target
	jalr	6,1			# read byte
					# result is in r3

	sw	5,7,3			# store incremented input on stack
	add	2,4,0			# store incremented byte into r2

	lw	5,7,5			# restore r5
	lw	4,7,4			# restore r4
	lw	1,7,-18			# point to write_byte target
	jalr	6,1			# write_byte

	add	1,3,0			# move byte into r1
	lw	3,7,3			# restore input pointer
	beq	1,0,done_strcat		# if 0, done
	beq	0,0,strcat_loop		# else, loop

done_strcat:
	lw	5,7,5			# restore r5 before last increment
	lw	4,7,4			# restore r4 before last increment
	add	2,0,0			# clear r2, makes repeated cats easier
	lw	6,7,6			# restore r6
	addi	7,7,-16			# restore stack
	jalr	0,6			# return

#============================
# center_and_print
#============================
# out_buffer = string to print
# Emits the "escape" string, the decimal value (80-strlen)/2, the "c"
# string, the contents of out_buffer and finally "linefeed".
# All stack-slot offsets here are 16 lower than in the main program
# because r7 has been advanced by 16.
# The final print is done by falling through into write_stdout with
# r6/r7 already restored, so write_stdout returns to our caller.

center_and_print:
	addi	7,7,16			# move stack
	sw	6,7,6			# back up r6

	movi	3,escape		# Print "^[["
	lw	1,7,-23			# point to write_stdout target
	jalr	6,1			# write_stdout

	lw	5,7,-22			# point r5 to out_buffer
	lw	1,7,-28			# point to strlen target
	jalr	6,1			# strlen
					# length comes back in r4

	movi	1,80			# load in 80
	nand	4,4,4			# two's complement the string length
	addi	4,4,1			#
	add	2,4,1			# subtract: r2 = 80 - length

	addi	3,0,2			# we want to divide by 2
	lw	1,7,-20			# point to shift_right target
	jalr	6,1			# shift right

	add	2,1,0			# load back the result
	lw	5,7,-30			# point to our ascii_buffer
	add	4,0,0
	lw	1,7,-31			# point to num_to_ascii target
	jalr	6,1			# num_to_ascii

	lw	3,7,-30			# Point to our ascii number and print
	lw	1,7,-23			# point to write_stdout target
	jalr	6,1			# write stdout

	movi	3,c			# Print "C"
	lw	1,7,-23			# point to write_stdout target
	jalr	6,1			# write stdout

	lw	3,7,-22			# point r3 to out_buffer
	lw	1,7,-23			# point to write_stdout target
	jalr	6,1			# write stdout

	movi	3,linefeed		# Point to string
	lw	6,7,6			# restore r6
	addi	7,7,-16			# restore stack

	# fall through to write_stdout

#============================
# write_stdout
#============================
# r3 = string to print
# r1,r2,r4,r5 = trashed
# The return address is parked in slot 0 of the CALLER's r7 (r7 is not
# moved here), and strlen is reached via an absolute movi because r7
# differs between callers.

write_stdout:
	sw	6,7,0			# store return value on stack
	add	5,0,3			# copy r3 to r5
	movi	1,strlen
	jalr	6,1
					# r4 = length, r3 = string
	addi	1,0,SYSCALL_WRITE	# Put syscall in R1
	addi	2,0,STDOUT		# Stdout in R2
	sys	0			# syscall
	lw	6,7,0			# load return off stack
	jalr	0,6			# return

#===============================
# strlen
#===============================
# r5 points to string (assumes starts at even address)
# r1,r2 trashed
# r4 returns count
# r5 is advanced one word per two bytes counted.

strlen:
	addi	4,0,0			# clear count
strlen_loop:
	lw	2,5,0			# load 16-bit value
	lui	1,0xff00		# load mask
	nand	1,1,2			# nand it
	nand	1,1,1			# invert to get and
	beq	1,0,strlen_done		# if 0 (high byte is 0), we're done
	addi	4,4,1			# increment count
	lw	2,5,0			# load 16-bit value
					# NOTE(review): r2 still holds this
					# word, so this reload looks
					# redundant; left in place to keep
					# the documented byte/cycle counts.
	beq	1,2,strlen_done		# if the loaded string matches
					# the masked string, it means
					# the bottom byte is 0
	addi	4,4,1			# increment count
	addi	5,5,1			# increment pointer
	beq	0,0,strlen_loop		# loop

strlen_done:
	jalr	0,6			# return

#==============================
# num_to_ascii
#==============================
# r5/r4=buffer
# r2=value

num_to_ascii:
	addi	7,7,16			# move stack
	sw	6,7,6			# save return value
	sw	5,7,5			# save the output_ptr
	sw	4,7,4			# save the output_byte
	sw	0,7,8			# seen non-zero = 0

	movi	5,ten_thousand		# point to end of tens array

div_loop:
	lw	6,5,0			# load current power of 10 into r6
	beq	6,0,done_div		# if it's a zero, we're done
	add	3,0,0			# set our counter to zero
sub_loop: add 1,6,0 # This code checks for r2