#
#  linux_logo in microblaze assembler 0.38
#
#  By
#       Vince Weaver
#
#  assemble with     "as -o ll.o ll.microblaze.s"
#  link with         "ld -o ll ll.o"
#
#  I have to cross-compile, so what I actually do is more like
#      make CROSS=/usr/local/bin/mb- ARCH=microblaze
#
#  I use qemu for simulating this code (I have no microblaze hardware)
#
# Architectural Info
#   Big-endian
#   3-operand
#   32 32-bit registers
#     r0 is a zero register
#     r1 is stack pointer
#     r2 = r/o small data area
#     r3,r4 = return value
#     r5-r10 = parameters
#     r11-r12 = temp vars
#     r13 = r/w small data area
#     r14 = return address for interrupt
#     r15 = return address for functions
#     r16 = return address for debug/breaks
#     r17 = exception return address
#     r18 = reserved for compiler
#     r19-r31 = general use?
#   has (optional) branch-delay slots
#   aligned memory accesses (can be configured otherwise v3.0 and later)
#   HW multiply (post Virtex-II)
#
# System Calls
#   syscall number in r12
#   params in r5-r10
#   brki r14, 0x08
#   nop
#
# instruction set
#   32-bit wide instructions
#   16-bit immediates
#   usually   add rd,ra,rb   (rd=destination)
#
#   add, addc, addk, addkc     [carry, keep carry means don't update carry]
#   addi, addic, addik, addikc [add immediate]
#   and, andi, andn, andni     [and, and not]
#   beq, beqd, beqi, beqid     [branch if equal, with delay, immediate]
#   bge, bged, bgei, bgeid     [branch if greater or equal]
#   bgt, bgtd, bgti, bgtid     [branch if greater than]
#   ble, bled, blei, bleid     [branch if less or equal]
#   blt, bltd, blti, bltid     [branch if less than]
#   bne, bned, bnei, bneid     [branch if not equal]
#   br, bra, brd, brad, brld, brald
#                              [unconditional branch.  l = and link]
#   bri, brai, brid, braid, brlid, bralid
#                              [unconditional branch immediate]
#   brk, brki                  [break]
#   bsrl, bsra, bsll           [barrel shift right logical, right arith,
#                               left logic]
#   bsrli, bsrai, bslli        [barrel shift immediate]
#   cmp, cmpu                  [compare]
#   get, nget, cget, ncget     [read from interface]
#   idiv, idivu                [divide.  only valid if config'd for divider]
#   imm                        [load 16-bit immediate value to be used to
#                               make 32-bit immediate]
#   lbu, lbui                  [load byte unsigned]
#   lhu, lhui                  [load halfword unsigned]
#   lw, lwi                    [load word]
#   mfs, msrclr, msrset, mts   [manipulate special reg]
#   mul, muli                  [multiply, if configured]
#   or, ori                    [or]
#   put, nput, cput, ncput     [write to interface]
#   rsub, rsubc, rsubk, rsubkc [reverse subtract]
#   rsubi, rsubic, rsubik, rsubikc
#                              [reverse subtract immediate]
#   rtbd, rtid, rted           [return from break, interrupt, exception]
#   rtsd                       [return from subroutine.  always has delay slot]
#   sb, sbi                    [store byte]
#   sext16, sext8              [sign extend]
#   sh, shi                    [store halfword]
#   sra, src, srl              [shift right, arith, with carry, logical]
#   sw, swi                    [store word]
#   wdc, wic                   [write to data, instruction cache]
#   xor, xori                  [xor]
#
# Optimization
# + 1671 bytes - original version, ported from MIPS
# + 1518 bytes - remove extraneous alignment of data segment
# + 1490 bytes - make r20 be the "out_buffer" register
# + 1334 bytes - make data loads use r19 and an offset
#                eliminate as many empty branch delay slots as possible
#                either by filling or replacing branch with no-delay version.
#                Had to be careful not to have a 32-bit load immediate
#                in a delay slot.
# + 1318 bytes - Put text_buf into a reg and use ra+rb addressing
# + 1314 bytes - remove un-needed register move
# + 1310 bytes - re-optimize write_stdout
# + 1298 bytes - have find_string use 3 bytes, not 4

# Supplies the LZSS parameters used below (N, F, THRESHOLD, P_BITS,
# POSITION_MASK) -- presumably; they are not defined in this file.
.include "logo.include"

# offsets into the results returned by the uname syscall
.equ U_SYSNAME,0
.equ U_NODENAME,65
.equ U_RELEASE,65*2
.equ U_VERSION,(65*3)
.equ U_MACHINE,(65*4)
.equ U_DOMAINNAME,65*5

# offset into the results returned by the sysinfo syscall
.equ S_TOTALRAM,16

# Offsets into the data segment (relative to data_begin, held in r19)
# Wish I could get the assembler to do this automatically
# These must be kept in sync by hand with the strings in .data below.
.equ VER_OFFSET, 0
.equ COMPILED_OFFSET, 10
.equ RAM_OFFSET,22
.equ BOGO_OFFSET,30
.equ LINEFEED_OFFSET,46
.equ DEFAULT_COLORS_OFFSET,48
.equ ESCAPE_OFFSET,54
.equ C_OFFSET,57
.equ CPUINFO_OFFSET,59
.equ ONE_OFFSET,73
.equ MHZ_OFFSET,78
.equ PROCESSOR_OFFSET,94
.equ LOGO_OFFSET,107

# Syscalls
.equ SYSCALL_EXIT,     1
.equ SYSCALL_READ,     3
.equ SYSCALL_WRITE,    4
.equ SYSCALL_OPEN,     5
.equ SYSCALL_CLOSE,    6
.equ SYSCALL_SYSINFO,116
.equ SYSCALL_UNAME,  122

#
.equ STDIN,0
.equ STDOUT,1
.equ STDERR,2

# Register roles that stay live for the whole program:
#   r19 = data_begin (base for all *_OFFSET string loads)
#   r20 = out_buffer (start of the line being built)
#   r21 = current write position inside out_buffer
#   r24 = uname_info (set at first_line, used again at last_line)
#   r26 = text_buf   (LZSS window, only used during decompression)

        .globl _start
_start:

        #=========================
        # PRINT LOGO
        #=========================

# LZSS decompression algorithm implementation
# by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
# optimized some more by Vince Weaver

        addi    r19,r0,data_begin       # point r19 at .data segment begin
        addi    r20,r0,out_buffer       # point r20 at out_buffer

        addi    r8,r0,(N-F)             # R

        addi    r9,r19,LOGO_OFFSET      # r9 points to logo
        addi    r12,r0,logo_end         # r12 points to end of logo
        add     r21,r0,r20              # point r21 to out_buffer
        addi    r26,r0,text_buf         # point r26 to text_buf

decompression_loop:
        lbu     r22,r0,r9               # load in a byte (8 flag bits)
        addi    r9,r9,1                 # increment source pointer

        ori     r22,r22,0xff00          # put 0xff in top as a hackish
                                        # 8-bit counter
                                        # ugh this expands to two instructions
                                        # because the 16-bit immediate is
                                        # sign-extended

test_flags:
        cmp     r18, r12, r9            # have we reached the end?
        beqi    r18, done_logo          # if so, exit

        andi    r23,r22,0x1             # test to see if discrete char

        bneid   r23,discrete_char       # if set, we jump to discrete char
        # BRANCH DELAY SLOT
        srl     r22,r22                 # shift right logical by 1

offset_length:
        lbu     r10,r0,r9               # load 16-bit length and
        lbui    r24,r9,1                # match_position combo, low byte first
                                        # can't use lhu because might be
                                        # unaligned
        addi    r9,r9,2                 # increment source pointer
        bslli   r24,r24,8
        or      r24,r24,r10             # r24 = (second byte << 8) | first byte

        bsrli   r25,r24,P_BITS          # get the top bits, which is length

        addi    r25,r25,THRESHOLD+1     # add in the threshold?

output_loop:
        andi    r24,r24,(POSITION_MASK<<8+0xff)
                                        # get the position bits

        lbu     r10,r26,r24             # load byte from text_buf[]
        addi    r24,r24,1               # advance pointer in text_buf

store_byte:
        sb      r10,r0,r21              # store byte to output buffer
        addi    r21,r21,1               # increment pointer
        sb      r10,r8,r26              # store also to text_buf[r]
        addi    r8,r8,1                 # r++

        addi    r25,r25,-1              # decrement count
        bneid   r25,output_loop         # repeat until k>j
        # BRANCH DELAY SLOT
        andi    r8,r8,(N-1)             # wrap r if we are too big

        andi    r23,r22,0xff00          # if 0 we shifted through 8 and must
        bnei    r23,test_flags          # re-load flags

        bri     decompression_loop

discrete_char:
        lbu     r10,r0,r9               # load a byte
        addi    r9,r9,1                 # increment pointer

        brid    store_byte              # and store it
        # BRANCH DELAY SLOT
        addi    r25,r0,1                # force a one-byte output

# end of LZSS code

done_logo:
        brlid   r15,write_stdout        # print the logo
        # BRANCH DELAY SLOT
        add     r6,r0,r20               # point r6 to out_buffer

first_line:
        #==========================
        # PRINT VERSION
        #==========================

        addi    r24,r0,uname_info       # r24 holds uname_info

uname_call:
        addi    r12,r0, SYSCALL_UNAME   # put uname syscall in r12
        add     r5,r0,r24               # destination struct
        brki    r14, 0x08               # syscall

        add     r21,r0,r20              # point r21 to out_buffer

os_name:
                                        # os-name from uname "Linux"
        brlid   r15,strcat
        # BRANCH DELAY SLOT
        addi    r5,r24,U_SYSNAME

version:
                                        # source is " Version "
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,VER_OFFSET

                                        # version from uname, ie "2.6.20"
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r24,U_RELEASE

compiled:
                                        # source is ", Compiled "
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,COMPILED_OFFSET

                                        # compiled date
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r24,U_VERSION

        brlid   r15,center_and_print    # center and print
        # BRANCH DELAY SLOT
        nop                             # no such instruction as brli
                                        # without a delay :(

        #===============================
        # Middle-Line
        #===============================
middle_line:
        add     r21,r0,r20              # point r21 to out_buffer

        #=========
        # Load /proc/cpuinfo into buffer
        #=========

        addi    r12,r0,SYSCALL_OPEN     # OPEN Syscall

        addi    r5,r19,CPUINFO_OFFSET   # '/proc/cpuinfo'
        add     r6,r0,r0                # 0 = O_RDONLY

        brki    r14,0x08                # syscall.  fd comes back in r3
                                        # we should check that
                                        # return r3>=0

        add     r5,r3,r0                # copy r3 (the result) to r5

        addi    r12,r0,SYSCALL_READ     # read()

        addi    r6,r0,disk_buffer       # point r6 to the buffer

        addi    r7,r0,4096              # we assume cpuinfo file is <4096bytes
        brki    r14,0x08

        addi    r12,r0,SYSCALL_CLOSE    # close (to be correct)
                                        # fd should still be in r5
        brki    r14,0x08

        #=============
        # Number of CPUs
        #=============
number_of_cpus:
                                        # we cheat here and just assume 1.
                                        # I don't know if SMP microblaze
                                        # machines exist
        brlid   r15, strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,ONE_OFFSET       # print "One"

        #=========
        # MHz
        #=========
print_mhz:
        addi    r5,r0,('M'<<16+'H'<<8+'z')
                                        # find '-MHz' and grab up to '.'
                                        # (32-bit immediate, so it must not
                                        # sit in a delay slot)
        brlid   r15,find_string
        # BRANCH DELAY SLOT
        addi    r6,r0,'.'               # find up to "."

                                        # print "MHz"
        brlid   r15,strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,MHZ_OFFSET

        #=========
        # Chip Name
        #=========
chip_name:
        addi    r5,r0,('r'<<16+'c'<<8+'h')
                                        # find 'Arch' and grab up to '\n'
        brlid   r15,find_string         # (operand comma restored so this
                                        # matches every other call site)
        # BRANCH DELAY SLOT
        addi    r6,r0,'\n'              # find up to "\n"

                                        # print " Processor, "
        brlid   r15,strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,PROCESSOR_OFFSET

        #========
        # RAM
        #========
ram:
        addi    r12,r0,SYSCALL_SYSINFO  # sysinfo() syscall
        addi    r5,r0,sysinfo_buff
        brki    r14,0x08

        lwi     r5,r5,S_TOTALRAM        # size in bytes of RAM
                                        # (relies on r5 still holding
                                        # sysinfo_buff after the syscall)
        add     r7,r0,r0                # print to strcat, not stdout

        brlid   r15,num_to_ascii
        # BRANCH DELAY SLOT
        bsrli   r5,r5,20                # divide by 1024*1024 to get M

                                        # print 'M RAM, '
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,RAM_OFFSET

        #========
        # Bogomips
        #========
bogomips:
        addi    r5,r0,('i'<<16+'p'<<8+'s')
                                        # find 'Mips' and grab up to \n
        brlid   r15,find_string
        # BRANCH DELAY SLOT
        addi    r6,r0,'\n'              # find up to "\n"

                                        # bogo total follows RAM
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r19,BOGO_OFFSET

        brlid   r15,center_and_print    # center and print
        # BRANCH DELAY SLOT
        nop                             # no such thing as brli :(

        #=================================
        # Print Host Name
        #=================================
last_line:
        add     r21,r0,r20              # point r21 to out_buffer

                                        # host name from uname()
        brlid   r15,strcat              # call strcat
        # BRANCH DELAY SLOT
        addi    r5,r24,U_NODENAME

        brlid   r15,center_and_print    # center and print
        # BRANCH DELAY SLOT
        nop                             # brli doesn't exist :(

                                        # (.txt) pointer to default_colors
        brlid   r15,write_stdout
        # BRANCH DELAY SLOT
        addi    r6,r19,DEFAULT_COLORS_OFFSET

        #================================
        # Exit
        #================================
exit:
        addi    r12, r0, SYSCALL_EXIT   # put exit syscall in r12
        addi    r5, r0,5                # return value
                                        # NOTE(review): a non-zero status on
                                        # the normal path looks like a debug
                                        # leftover -- confirm before changing
        brki    r14, 0x08               # syscall

        #=================================
        # FIND_STRING
        #=================================
        # Scans disk_buffer for a 3-character key, skips ahead to the
        # value that follows it, and copies that value to out_buffer.
        #
        # r5 is 3-char ascii string to look for (packed, first char highest)
        # r6 is char to end at
        # r21 = output position; advanced past every byte copied
        # r11,r22,r23 trashed
        # Returns without copying anything if the key is not found.

find_string:
        addi    r11,r0,disk_buffer      # look in cpuinfo buffer

find_loop:
                                        # load unaligned 3 bytes into reg
        lbui    r22,r11,0               # load first byte
        add     r23,r22,r0              # move
        bslli   r23,r23,8               # shift
        lbui    r22,r11,1               # load second byte
        add     r23,r22,r23             # merge
        bslli   r23,r23,8               # shift
        lbui    r22,r11,2               # load third byte
        add     r23,r22,r23             # merge

        beqi    r23,done                # are we at EOF? (three NUL bytes)
                                        # if so, done

        addi    r11,r11,1               # increment pointer

        cmp     r22,r23,r5
        bnei    r22, find_loop          # do the strings match?
                                        # if not, loop

                                        # if we get this far, we matched
        addi    r11,r11,4               # skip to spacing

skip_spaces:
        lbui    r22,r11,1               # repeat till we find non-space
        addi    r11,r11,1
        beqi    r22,done                # if 0, at end
        addi    r23,r0,' '
        cmp     r23,r23,r22
        blei    r23,skip_spaces         # loop while char <= ' '

store_loop:
        lbu     r22,r0,r11              # load value
        addi    r11,r11,1               # increment
        beqi    r22,done                # off end, then stop
        cmp     r23,r22,r6
        beqi    r23,done                # is it end char?
        sb      r22,r0,r21              # if not store and continue
        brid    store_loop              # loop
        # BRANCH DELAY SLOT
        addi    r21,r21,1               # increment output pointer

done:
        rtsd    r15,8                   # return (delay slot version)
        # BRANCH DELAY SLOT
        nop

        #==============================
        # center_and_print
        #==============================
        # Prints the string in out_buffer centered on an 80-column line,
        # followed by a linefeed.
        #
        # string is in output_buffer (r20 = start, r21 points to end)
        # r31 trashed (backup of return address)
        # r5,r6,r7,r23 trashed, plus whatever write_stdout and
        # num_to_ascii trash

center_and_print:
        add     r31,r0,r15              # save return address
        add     r23,r0,r20              # point r23 to beginning
                                        # end is in r21

        rsub    r5,r23,r21              # r5 = end - start
                                        # (cheaty way to get size of string)

        rsubi   r5,r5,80                # r5 = 80 - length

        blti    r5,done_center          # don't center if > 80

        srl     r23,r5                  # divide by 2, store for later

        brlid   r15,write_stdout        # print ESCAPE char
        # BRANCH DELAY SLOT
        addi    r6,r19,ESCAPE_OFFSET

        addi    r7,r0,1                 # print to stdout
        brlid   r15,num_to_ascii        # print number of spaces
        # BRANCH DELAY SLOT
        add     r5,r0,r23               # how much to shift to right

        brlid   r15,write_stdout
        # BRANCH DELAY SLOT
        addi    r6,r19,C_OFFSET         # print "C"

done_center:
        brlid   r15,write_stdout
        # BRANCH DELAY SLOT
        add     r6,r0,r20               # point to the string to print

        addi    r6,r19,LINEFEED_OFFSET  # print linefeed at end of line

        add     r15,r0,r31              # restore return address

        # fall through to write_stdout, which returns to our caller

        #================================
        # WRITE_STDOUT
        #================================
        # Writes a NUL-terminated string to stdout with one write syscall.
        #
        # r6 has string
        # r5,r7,r9,r12 destroyed

write_stdout:
        addi    r12, r0, SYSCALL_WRITE  # Write syscall in r12
        addi    r5, r0, STDOUT          # STDOUT in r5
        add     r7,r0,r0                # count in r7

str_loop1:
        lbu     r9,r6,r7                # load byte at r6+r7
        beqi    r9,str_done             # if nul, done
        brid    str_loop1               # loop
        # BRANCH DELAY SLOT
        addi    r7,r7,1                 # increment count

str_done:
        brki    r14, 0x08               # run the syscall
        rtsd    r15,8                   # return (branch delayed version)
        # BRANCH DELAY SLOT
        nop

        #=======================
        # num_to_ascii
        #=======================
        # Converts r5 to decimal text, built backwards from the end of
        # ascii_buffer, then either prints it or appends it to out_buffer.
        #
        # r5 = value to print
        # r6 = pointer to output (set here; points at first digit on exit)
        # r7 = 1 if stdout, 0 if strcat
        # r8,r11,r22 trashed
        # The terminating NUL is the never-written byte at ascii_buffer+10.

num_to_ascii:
        addi    r6,r0,ascii_buffer+10   # point to end of ascii_buffer

div_by_10:
        addi    r6,r6,-1                # point back one
        addi    r11,r0,10               # divide by 10
        idiv    r22,r11,r5              # quotient in r22 (r5 / 10)
        muli    r8,r22,10               # calculate remainder
        rsub    r8,r8,r5                # remainder is in r8
        addi    r8,r8,0x30              # convert to ascii
        sb      r8,r0,r6                # store to buffer
        add     r5,r22,r0               # move old result into next divide
        bnei    r22, div_by_10

write_out:
        bnei    r7,write_stdout         # if write_stdout, go there

                                        # else fall through to strcat
        add     r5,r6,r0

        #================================
        # strcat
        #================================
        # Appends a NUL-terminated string to out_buffer.  The NUL is
        # copied too, and r21 is left pointing at it.
        #
        # output_buffer_offset = r21
        # string to cat = r5
        # destroys r11

strcat:
        lbu     r11,r0,r5               # load byte from string
        addi    r5,r5,1                 # increment string
        sb      r11,r0,r21              # store byte to output_buffer
        bneid   r11,strcat              # if zero, we are done
        # BRANCH DELAY SLOT
        addi    r21,r21,1               # increment output_buffer

done_strcat:
        rtsd    r15,8                   # return
        # BRANCH DELAY SLOT
        addi    r21,r21,-1              # correct pointer

#===========================================================================
#       section .data
#===========================================================================
.data
data_begin:
ver_string:             .ascii  " Version \0"
compiled_string:        .ascii  ", Compiled \0"
ram_comma:              .ascii  "M RAM, \0"
bogo_total:             .ascii  " Bogomips Total\0"
linefeed:               .ascii  "\n\0"
default_colors:         .ascii  "\033[0m\n\0"
escape:                 .ascii  "\033[\0"
c:                      .ascii  "C\0"

.ifdef FAKE_PROC
cpuinfo:                .ascii  "proc/c.ublaze\0"
.else
cpuinfo:                .ascii  "/proc/cpuinfo\0"
.endif

one:                    .ascii  "One \0"
mhz:                    .ascii  "MHz Microblaze \0"
processor:              .ascii  " Processor, \0"

# Compressed logo data; must start at LOGO_OFFSET and presumably
# defines logo_end.
.include        "logo.lzss_new"

#============================================================================
#       section .bss
#============================================================================
.bss
bss_begin:
.lcomm  out_buffer,16384
.lcomm  text_buf, (N+F-1)
.lcomm  disk_buffer,4096                # we cheat!!!!
.lcomm  ascii_buffer,11                 # up to 10 digits for a 32-bit value,
                                        # plus the NUL at ascii_buffer+10 that
                                        # num_to_ascii relies on
                                        # see /usr/src/linux/include/linux/kernel.h
.lcomm  sysinfo_buff,(64)
.lcomm  uname_info,(65*6)