;
;  linux_logo in hppa pa-risc assembler 0.18
;
;  By
;       Vince Weaver
;
;  assemble with     "as -o ll.o ll.parisc.s"
;  link with         "ld -o ll ll.o"
;
;  hppa specific things:
;    * labels must begin in column zero
;    * EQU statements have the label first, ie  STDERR: .equ 2
;    * branch delay slots
;    * can only shift left by a max of 3!
;      use depw to do it instead!  Also there is a shift amount reg..
;    * no and immediate instruction!
;    * stack grows *up*, not down
;    * no full hardware mul/div.  Have to pipeline by hand
;    * weird halfword arithmetic modes
;
;  architecture hints
;    32 gp registers, %r0-%r31
;    %r0 is a zero register, %r1 target of ADDIL, %r31 target of BLE
;    %cr11 (%sar) shift amount register?
;
;  Calling convention:
;    %r2       = return link?
;    %r19-%r22 = t4-t1 (not a typo) temp registers
;    %r23-%r26 = arg3-arg0 argument registers
;    %r27      = data pointer
;    %r28      = ret0 = return value
;    %r29      = ret1 = return value, static link
;    %r30      = stack pointer
;    %r31      = millicode return link
;
;  To load a 32 bit word, L'SYMBOL refers to left 21 bits
;                         R'SYMBOL refers to right 11 bits
;    ie:  LDIL L'START,%r1
;         LDO  R'START(%r1),%r1
;
;  Instructions are source, source, destination, ie add %r1,%r2,%r3 = r3=r2+r1
;
;  Weird concept of "spaces"
;
;     ble 0x100(%sr2,%r0)
;  is a syscall instruction.  Linux Syscalls are in area 0x100
;  WARNING!  BE CAREFUL!  I managed to lock hard a pa-risc system
;            by using an ill-formed syscall instruction.
.include "logo.include.parisc"

; NOTE(review): N, F, THRESHOLD, P_BITS, POSITION_MASK, logo and logo_end are
; used below but not defined in this file; presumably they come from the two
; .include files -- confirm.

# offsets into the results returned by the uname syscall
U_SYSNAME:       .equ 0
U_NODENAME:      .equ 65
U_RELEASE:       .equ 65*2
U_VERSION:       .equ 65*3
U_MACHINE:       .equ 65*4
U_DOMAINNAME:    .equ 65*5

# offset into the results returned by the sysinfo syscall
S_TOTALRAM:      .equ 16

# Syscalls
SYSCALL_EXIT:    .equ 1
SYSCALL_READ:    .equ 3
SYSCALL_WRITE:   .equ 4
SYSCALL_OPEN:    .equ 5
SYSCALL_CLOSE:   .equ 6
SYSCALL_SYSINFO: .equ 116
SYSCALL_UNAME:   .equ 59

# standard file descriptors
STDIN:           .equ 0
STDOUT:          .equ 1
STDERR:          .equ 2

        .globl _start

;-----------------------------------------------------------------------
; _start -- program entry point.
;
; Decompresses and prints the logo, then builds and prints three centered
; text lines (OS version, CPU/RAM summary, host name) and exits.
;
; Registers that stay live for the whole program:
;   %r21 = address of ver_string (base for all .data strings)
;   %r22 = address of ascii_buff (base for the small bss buffers)
;   %r16 = start of the line currently being built in out_buffer
;   %r17 = current write position in out_buffer
;   %r15 = address of uname_info (after the uname syscall)
;   %r14 = file descriptor returned by open
; These are read again after syscalls without being reloaded, so the
; program relies on the syscall gateway leaving them intact.
;
; Every branch has a delay slot: the instruction written right after a
; branch executes before the branch target unless the branch uses ",n".
;-----------------------------------------------------------------------
_start:
        #=========================
        # PRINT LOGO
        #=========================

        # LZSS decompression algorithm implementation
        # by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
        # optimized some more by Vince Weaver

        # we used to fill the buffer with FREQUENT_CHAR
        # but, that only gains us one byte of space in the lzss image.
        # the lzss algorithm does automatic RLE... pretty clever
        # so we compress with NUL as FREQUENT_CHAR and it is pre-done for us

        ; LZSS register roles:
        ;   %r8  = r, write index into text_buf
        ;   %r9  = source pointer into the compressed logo
        ;   %r12 = end of the compressed logo
        ;   %r27 = address of text_buf
        ;   %r11 = flag byte with 0xff00 or-ed in as an 8-step counter
        ;   %r29 = 0xff00
        ;   %r15 = number of bytes left to emit for the current token
        ;   %r24 = read index into text_buf for a match

        ldil    L'ver_string,%r21       ; store beginning of .data seg
        ldo     R'ver_string(%r21),%r21 ; in r21 to avoid 2 load instructions
                                        ; each time

        ldil    L'ascii_buff,%r22       ; store beginning of .bss seg
        ldo     R'ascii_buff(%r22),%r22 ; in r22 to avoid 2 load instructions

        ldi     (N-F),%r8               ; R

        addi    (logo-ver_string),%r21,%r9      ; %r9 points to logo
        addi    (logo_end-ver_string),%r21,%r12 ; %r12 points to logo_end
        addi    (text_buf-ascii_buff),%r22,%r27 ; %r27 points to text_buf

        ldil    L'out_buffer,%r16
        ldo     R'out_buffer(%r16),%r16 ; point %r16 to out_buffer
                                        ; too far to add immediate
        copy    %r16,%r17               ; copy to %r17 for output

        ldi     0x1fe0,%r29             ; annoying way to get 0xff00 into r29
        shladd  %r29,3,%r0,%r29         ; 0x1fe0 << 3 = 0xff00

decompression_loop:
        ldb     0(%r9),%r10             ; load in a byte
        addi    1,%r9,%r9               ; increment source pointer

        copy    %r10, %r11              ; move in the flags
        or      %r11,%r29,%r11          ; re-load top as a hacky 8-bit counter

test_flags:
        cmpb,=  %r12,%r9, done_logo     ; have we reached the end?
                                        ; if so, exit
        ; DELAY SLOT (runs whether or not the branch is taken)
        ldi     1,%r13                  ; check load bit (no andi instr!)
        and     %r13,%r11,%r13          ; %r13 = lowest flag bit

        cmpib,<> 0,%r13,discrete_char   ; if set, we jump to discrete char
        ; BRANCH DELAY SLOT
        shrpw   %r0,%r11,1,%r11         ; consume the flag bit: flags >>= 1

offset_length:
                                        ; PA-RISC doesn't like unaligned ldh
        ldb     1(%r9),%r24
        ldb     0(%r9),%r10
        depw    %r24,23,8,%r10          ; combine into a 16-bit value
        copy    %r10,%r24               ; copy r10 to r24
        addi    2,%r9,%r9               ; get match_length and match_position

        shrpw   %r0,%r10,P_BITS,%r15
        addi    THRESHOLD+1,%r15,%r15
                                ; r15 = (r10 >> P_BITS) + THRESHOLD + 1
                                ;       (=match_length)

output_loop:
        depwi,z POSITION_MASK,23,8,%r13 ; %r13=POSITION_MASK<<8
        addi    0xff,%r13,%r13          ; %r13+=0xff
        and     %r24,%r13,%r24          ; mask it
        ldb     %r27(%r24),%r10         ; load byte from text_buf[]
        addi    1,%r24,%r24             ; advance pointer in text_buf

store_byte:
        stb     %r10,0(%r17)
        addi    1,%r17,%r17             ; store it

        add     %r27,%r8,%r26
        stb     %r10,0(%r26)            ; store also to text_buf[r]
        addi    1,%r8,%r8               ; r++
        ldi     (N-1),%r13

        addi    -1,%r15,%r15            ; decrement count
        cmpib,<> 0,%r15,output_loop     ; repeat until k>j
        ; BRANCH DELAY SLOT
        and     %r13,%r8,%r8            ; mask r

        and     %r11,%r29,%r13          ; if 0 we shifted through 8 and must
        cmpib,<> 0,%r13,test_flags      ; re-load flags
        nop                             ; BRANCH DELAY SLOT

        b       decompression_loop      ; no nop needed as following
        ; BRANCH DELAY SLOT             ; instruction is harmless
        ; (the ldb at discrete_char executes in this delay slot)

discrete_char:
        ldb     0(%r9),%r10
        addi    1,%r9,%r9               ; load a byte
        b       store_byte              ; and store it
        ; BRANCH DELAY SLOT
        ldi     1,%r15                  ; want one char

# end of LZSS code

done_logo:
        bl      write_stdout,%r2        ; print the logo
        ; BRANCH DELAY SLOT
        copy    %r16,%r25

        #==========================
        # PRINT VERSION
        #==========================

        addi    (uname_info-ascii_buff),%r22,%r26 ; destination of uname
        copy    %r26,%r15               ; save uname_info struct addr
        ble     0x100(%sr2,%r0)         ; syscall
        ; BRANCH DELAY SLOT
        ldi     SYSCALL_UNAME,%r20      ; uname syscall

        copy    %r16, %r17              ; restore buffer offset pointer

        bl      strcat,%r2
        ; BRANCH DELAY SLOT
        addi    U_SYSNAME,%r15,%r25     ; os-name from uname "Linux"

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    (ver_string-ver_string),%r21,%r25 ; source is " Version "

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    U_RELEASE,%r15,%r25     ; version from uname ie "2.4.1"

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    (compiled_string-ver_string),%r21,%r25 ; source is ", Compiled "

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    U_VERSION,%r15,%r25     ; compiled date

        bl      center_and_print,%r2    ; center and print
        nop                             ; branch delay

        #===============================
        # Middle-Line
        #===============================

        copy    %r16,%r17               ; restore output_buffer

        #=========
        # Load /proc/cpuinfo into buffer
        #=========

        addi    (cpuinfo-ver_string),%r21,%r26 ; '/proc/cpuinfo'
        ldi     0,%r25                  ; 0 = O_RDONLY
        ldi     0,%r24                  ; third argument = 0
        ble     0x100(%sr2,%r0)         ; syscall.
        ldi     SYSCALL_OPEN, %r20      ; syscall set in branch delay slot

                                        ; the result is taken from %r28
                                        ; we should check that it is >= 0;
                                        ; it is used unchecked here

        copy    %r28,%r14               ; save the resulting fd
        copy    %r14,%r26               ; set fd to arg0
        ldil    L'disk_buffer,%r25      ; point to disk buffer
        ldo     R'disk_buffer(%r25),%r25
        ldi     4096,%r24               ; 4096 is maximum size of proc file ;)
        ble     0x100(%sr2,%r0)         ; syscall.
        ldi     SYSCALL_READ, %r20      ; syscall set in branch delay slot

        copy    %r14,%r26               ; set fd to arg0
        ble     0x100(%sr2,%r0)         ; syscall.
        ldi     SYSCALL_CLOSE, %r20     ; syscall set in branch delay slot

        #=============
        # Number of CPUs
        #=============
number_of_cpus:
        # we cheat here and just assume 1.
        # besides, I don't have a SMP PA-RISC machine to test on

        bl      strcat,%r2
        ; BRANCH DELAY SLOT
        addi    (one-ver_string),%r21,%r25 ; print "One"

        #=========
        # MHz
        #=========
print_mhz:
        ldil    L'((0x20<<24)+(0x4d<<16)+(0x48<<8)+0x7a) ,%r26
        ldo     R'((0x20<<24)+(0x4d<<16)+(0x48<<8)+0x7a)(%r26) ,%r26
                                        ; find " MHz" and grab up to .
        bl      find_string,%r2
        ; BRANCH DELAY SLOT
        ldi     0x2e,%r24               ; 0x2e is ascii for .

        bl      strcat,%r2              ; strcat
        ; BRANCH DELAY SLOT
        addi    (MHz-ver_string),%r21,%r25 ; print "MHz PA-RISC "

        #=========
        # Chip Name
        #=========
chip_name:
        ldil    L'((0x63<<24)+(0x70<<16)+(0x75<<8)+0x9) ,%r26
        ldo     R'((0x63<<24)+(0x70<<16)+(0x75<<8)+0x9)(%r26) ,%r26
                                        ; find "cpu\t" and grab up to ' '
        bl      find_string,%r2
        ; BRANCH DELAY SLOT
        ldi     0x20,%r24               ; 0x20 is ascii for ' '

        bl      strcat,%r2              ; strcat
        ; BRANCH DELAY SLOT
        addi    (processor-ver_string),%r21,%r25 ; " Processor, "

        #========
        # RAM
        #========

        addi    (sysinfo_buff-ascii_buff),%r22,%r26
        ble     0x100(%sr2,%r0)         ; syscall.
        ldi     SYSCALL_SYSINFO, %r20   ; syscall set in branch delay slot

        ; %r26 is read again here without being reloaded, so this relies on
        ; the syscall leaving arg0 intact.
        ldw     S_TOTALRAM(%r26),%r26   ; size in bytes of RAM
        shrpw   %r0,%r26,20,%r6         ; divide by 1024*1024 to get M

        bl      num_to_ascii,%r2
        ; BRANCH DELAY SLOT
        ldi     1,%r9                   ; print to buffer

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    (ram_comma-ver_string),%r21,%r25 ; print 'M RAM, '

        #========
        # Bogomips
        #========

        ldil    L'((0x6d<<24)+(0x69<<16)+(0x70<<8)+0x73) ,%r26
        ldo     R'((0x6d<<24)+(0x69<<16)+(0x70<<8)+0x73)(%r26) ,%r26
                                        ; find 'mips\t: ' and grab up to \n
        bl      find_string,%r2
        ; BRANCH DELAY SLOT
        ldi     0xa,%r24                ; 0xa is \n in ascii

        bl      strcat,%r2              ; strcat
        ; BRANCH DELAY SLOT
        addi    (bogo_total-ver_string),%r21,%r25 ; bogo total follows RAM

        bl      center_and_print,%r2    ; center and print
        nop

        #=================================
        # Print Host Name
        #=================================

        copy    %r17,%r16               ; next line starts where the previous
                                        ; one ended: start pointer = end pointer

        bl      strcat,%r2              ; call strcat
        ; BRANCH DELAY SLOT
        addi    U_NODENAME,%r15,%r25    ; host name from uname()

        bl      center_and_print,%r2    ; center and print
        nop

        bl      write_stdout,%r2
        ; BRANCH DELAY SLOT
        addi    (default_colors-ver_string),%r21,%r25 ; pointer to default_colors

        #================================
        # Exit
        #================================
exit:
        ldi     0, %r26                 ; put exit code in arg0
        ble     0x100(%sr2,%r0)         ; syscall.  100 is linux gateway
        ldi     SYSCALL_EXIT, %r20      ; syscall in branch delay slot


        #=================================
        # FIND_STRING
        #=================================
        ; Scans disk_buffer for a 4-character pattern, skips forward to the
        ; next ':' plus one more character, then copies bytes to the output
        ; buffer until the end character or a zero byte is met.
        ;
        ; In:    %r26 = 4-char ascii string to look for (first char in the
        ;               most significant byte)
        ;        %r24 = char to end at (set by the caller in the delay slot)
        ;        %r17 = output buffer write position
        ;        %r2  = return address
        ; Out:   %r17 = advanced past the copied characters
        ;               (no terminating zero is stored here)
        ; Clobb: %r9, %r10, %r11, %r12
        ;
        ; The pattern search gives up only when the sliding 4-byte window is
        ; entirely zero, i.e. after four consecutive zero bytes.

find_string:
        ldil    L'(disk_buffer-1),%r10  ; look in cpuinfo buffer
        ldo     R'(disk_buffer-1)(%r10),%r10
        ldi     0x3a,%r12               ; 0x3a is ':'
        ldi     0x0,%r11                ; empty 4-byte window

find_loop:
                                        ; complicated load/shift
                                        ; PA-RISC doesn't like unaligned
                                        ; 32-bit loads
        depw    %r11,23,24,%r11         ; window <<= 8
        ldb     1(%r10),%r9             ; next byte
        depw    %r9,31,8,%r11           ; window low byte = next byte

        cmpib,= 0,%r11,done             ; are we at EOF?
        ; DELAY SLOT
        addi    1,%r10,%r10             ; increment pointer

        cmpb,<> %r26,%r11,find_loop     ; do the strings match?
        nop                             ; if not, loop

                                        ; if we get this far, we matched

find_colon:
        ldb     1(%r10),%r11            ; repeat till we find colon
        cmpib,= 0,%r11,done             ; not found? then done
        ; DELAY SLOT
        addi    1,%r10,%r10

        cmpb,<> %r12,%r11,find_colon    ; is it a colon?
        nop                             ; if not, loop

        addi    2,%r10,%r10             ; skip a char [should be space]

store_loop:
        ldb     0(%r10),%r11            ; load value
        cmpib,= 0,%r11,done             ; off end, then stop
        ; DELAY SLOT
        addi    1,%r10,%r10             ; increment

        cmpb,=  %r11,%r24,done          ; is it end char?
        nop                             ; if so, finish

        stb     %r11,0(%r17)            ; if not store and continue
        bl      store_loop,%r0          ; loop
        ; DELAY SLOT
        addi    1,%r17,%r17             ; increment output pointer

done:
        bv,n    %r0(%r2)                ; return
                                        ; branch delay is nullified


        #================================
        # strcat
        #================================
        ; Appends a zero-terminated string to the output buffer.
        ;
        ; In:    %r25 = string to cat
        ;        %r17 = output buffer write position
        ;        %r2  = return address
        ; Out:   %r17 = address of the terminating zero just stored, so the
        ;               next append overwrites it
        ; Clobb: %r18, %r25

strcat:
        ldb     0(%r25),%r18            ; load byte from string
        cmpib,= 0,%r18,done_strcat      ; if zero, we are done
        ; BRANCH DELAY SLOT
        stb     %r18,0(%r17)            ; store byte to output_buffer
                                        ; (also stores the final zero)
        addi    1,%r25,%r25             ; increment string
        bl      strcat,%r0              ; loop
        ; BRANCH DELAY SLOT
        addi    1,%r17,%r17             ; increment output_buffer

done_strcat:
        bv,n    %r0(%r2)                ; return
                                        ; branch delay is nullified


        #==============================
        # center_and_print
        #==============================
        ; Prints the line held in the output buffer followed by a linefeed.
        ; If the line is not longer than 80 characters it is preceded by
        ; escape + decimal number + "C", with number = (80 - length) / 2.
        ;
        ; In:    %r16 = start of the line in the output buffer
        ;        %r17 = end of the line (one past the last character)
        ;        %r2  = return address
        ; Clobb: %r3, %r4, %r5, %r9 (set to 0) and everything clobbered by
        ;        write_stdout and num_to_ascii
        ;
        ; Falls through into write_stdout for the final linefeed; that
        ; routine then returns directly to this routine's caller.

center_and_print:
        copy    %r2,%r3                 ; save return address
        sub     %r17,%r16,%r4           ; subtract end pointer from start
                                        ; (cheaty way to get size of string)
        ldi     80,%r5
        cmpb,>  %r4,%r5,done_center     ; don't center if > 80
        ; BRANCH DELAY SLOT
        ldi     0,%r9                   ; print to stdout

        sub     %r5,%r4,%r4             ; 80 - length
        shrpw   %r0,%r4,1,%r4           ; divide by two

        bl      write_stdout,%r2
        ; BRANCH DELAY SLOT
        addi    (escape-ver_string),%r21,%r25 ; print escape char

        bl      num_to_ascii,%r2        ; print number of spaces
        ; BRANCH DELAY SLOT
        copy    %r4,%r6

        bl      write_stdout,%r2
        ; BRANCH DELAY SLOT
        addi    (c-ver_string),%r21,%r25 ; print "C"

done_center:
        bl      write_stdout,%r2
        ; BRANCH DELAY SLOT
        copy    %r16, %r25              ; point to the string to print

        addi    (linefeed-ver_string),%r21,%r25 ; print linefeed at end of line

        copy    %r3,%r2                 ; restore saved pointer
                                        ; so we'll return to
                                        ; where we were called from
                                        ; at the end of the write_stdout

        #================================
        # WRITE_STDOUT
        #================================
        ; Writes a zero-terminated string to STDOUT with the write syscall.
        ;
        ; In:    %r25 (arg1) = string; it must hold at least one character
        ;        before its terminating zero, because the first byte is
        ;        counted without being tested
        ;        %r2  = return address
        ; Clobb: %r18, %r19, %r20, %r24, %r26

write_stdout:
        ldi     STDOUT, %r26            ; 1 in arg0 (stdout)
        ldi     0,%r24                  ; 0 (count) in arg2
        copy    %r25,%r18               ; copy string pointer

str_loop1:
        addi    1,%r18,%r18             ; increment pointer
        ldb     0(%r18),%r19            ; load byte at r18
        cmpib,<> 0,%r19,str_loop1       ; if r19 not zero, loop
        addi    1,%r24,%r24             ; BRDELAY: increment arg2

        ble     0x100(%sr2,%r0)         ; syscall.  100 is linux gateway
        ldi     SYSCALL_WRITE, %r20     ; syscall in branch delay slot

        bv,n    %r0(%r2)                ; return
                                        ; branch delay is nullified


        ##############################
        # num_to_ascii
        ##############################
        ; Converts an unsigned value to decimal text, built backwards in
        ; ascii_buff, then either prints it or appends it to the output
        ; buffer.
        ;
        ; In:    %r6 = value to print
        ;        %r9 = 0=stdout (tail-branches to write_stdout),
        ;              anything else=strcat (tail-branches to strcat)
        ;        %r2 = return address
        ; Clobb: %r1, %r5, %r6, %r7, %r23, %r25, %r26, %r28, %r29 and
        ;        whatever the chosen output routine clobbers
        ;
        ; Digits are stored from ascii_buff+9 downwards.  The byte at
        ; ascii_buff+10 is never written by this routine and serves as the
        ; string terminator.

num_to_ascii:
        addi    ((ascii_buff-ascii_buff)+10),%r22,%r5
                                        ; point to end of ascii_buffer
        copy    %r2,%r7                 ; save return value

div_by_10:
        addi    -1,%r5,%r5              ; point back one
        copy    %r6,%r26
        bl      div_uint,%r2            ; div by 10, result in %r28
        ; BRANCH DELAY SLOT
        ldi     0xa,%r25                ; dividing by 10

        copy    %r6,%r26
        bl      mod_uint,%r2            ; mod by 10, result in %r29
        ; BRANCH DELAY SLOT
        ldi     0xa,%r25                ; modding by 10

        addi    0x30,%r29,%r29          ; convert to ascii
        copy    %r28,%r6                ; move old result into next divide
        cmpib,<> 0,%r28,div_by_10
        ; BRANCH DELAY SLOT
        stb     %r29,0(%r5)             ; store to buffer

write_out:
        copy    %r5,%r25                ; %r25 = first digit

        cmpib,= 0,%r9,write_stdout      ; print to stdout if r9==0
        ; BRANCH DELAY SLOT
        copy    %r7,%r2                 ; restore return address

        b       strcat                  ; strcat will return for us
        ; BRANCH DELAY SLOT
        nop


        #
        # Divide and mod code based on code from the gcc compiler
        #

        #####################
        # unsigned int divide
        #####################
        ; In:    %r26 = dividend
        ;        %r25 = divisor
        ;        %r2  = return address
        ; Out:   %r28 = quotient
        ; Clobb: %r1, %r23
        ;
        ; %r23 holds a single bit that is shifted right once per loop pass;
        ; the loop ends when it reaches zero.

div_uint:
        ldil    0x80000 ,%r23           ; loop counter bit
                                        ; (author's note: 1<<31)
        subi    0,%r25,%r1              ; clear carry, negate the divisor
        ds      %r0,%r1,%r0             ; set V-bit to 1
        add     %r26,%r26,%r28          ; shift msb bit into carry
        ds      %r0,%r25,%r1            ; 1st divide step, if no carry

                                        ; typically you unroll this, but we are
                                        ; going for size, not speed
div_uint_loop:
        addc    %r28,%r28,%r28          ; shift %r28 with/into carry
        shrpw   %r0,%r23,1,%r23         ; shift does not mess with carry,
        cmpib,<> 0,%r23,div_uint_loop   ; add, sub, etc do
        ; BRANCH DELAY SLOT
        ds      %r1,%r25,%r1            ; divide step

        bv      %r0(%r2)                ; return
        ; BRANCH DELAY SLOT
        addc    %r28,%r28,%r28          ; shift last retreg bit


        ######################
        # unsigned int modulus
        ######################
        ; In:    %r26 = dividend
        ;        %r25 = divisor
        ;        %r2  = return address
        ; Out:   %r29 = remainder
        ; Clobb: %r1, %r23
        ;
        ; Same divide-step loop as div_uint; here %r1 takes the shifted
        ; dividend and %r29 accumulates the partial remainder.

mod_uint:
        ldil    0x80000 ,%r23           ; loop counter bit
                                        ; (author's note: 1<<31)
        subi    0,%r25,%r29             ; clear carry, negate the divisor
        ds      %r0,%r29,%r0            ; set V-bit to 1
        add     %r26,%r26,%r1           ; shift msb bit into carry
        ds      %r0,%r25,%r29           ; 1st divide step, if no carry

mod_uint_loop:
        addc    %r1,%r1,%r1             ; shift %r1 with/into carry
        shrpw   %r0,%r23,1,%r23         ; shift does not mess with carry,
        cmpib,<> 0,%r23,mod_uint_loop   ; add, sub, etc do
        ; BRANCH DELAY SLOT
        ds      %r29,%r25,%r29          ; divide step

        comiclr,<= 0,%r29,%r0           ; skip the next instruction
                                        ; when 0 <= %r29
        add     %r29,%r25,%r29          ; correction

        bv      %r0(%r2)                ; return
        nop


;===========================================================================
;       section .data
;===========================================================================
        .data

; All strings are zero-terminated and addressed as (label-ver_string)
; offsets from %r21, so ver_string must stay first.
ver_string:      .ascii " Version \0"
compiled_string: .ascii ", Compiled \0"
ram_comma:       .ascii "M RAM, \0"
bogo_total:      .ascii " Bogomips Total\0"
linefeed:        .ascii "\n\0"
default_colors:  .ascii "\033[0m\n\n\0"
escape:          .ascii "\033[\0"
c:               .ascii "C\0"
cpuinfo:         .ascii "/proc/cpuinfo\0"
one:             .ascii "One \0"
MHz:             .ascii "MHz PA-RISC \0"
processor:       .ascii " Processor, \0"

.include "logo.lzss_new.parisc"

;============================================================================
;       section .bss
;============================================================================
;.bss

; The small buffers are addressed as (label-ascii_buff) offsets from %r22,
; so ascii_buff must stay first.

.lcomm ascii_buff,12            ; a 32 bit value has up to 10 decimal digits;
                                ; num_to_ascii writes them at offsets 9..0 and
                                ; reads the untouched zero byte at offset 10
                                ; as the terminator, so that byte has to be
                                ; inside this buffer (12 keeps what follows
                                ; word-sized)

                                ; see /usr/src/linux/include/linux/kernel.h
.lcomm sysinfo_buff,(64)
.lcomm uname_info,(65*6)
.lcomm text_buf, (N+F-1)

.lcomm disk_buffer,4096         ; we cheat!!!!
.lcomm out_buffer, 16384