! ! linux_logo in SH3 assembler 0.30 ! ! By ! Vince Weaver ! ! assemble with "as -o ll.o ll.sh3.s" ! link with "ld -o ll ll.o" ! I have to cross-compile, so what I do is was more like ! make CROSS=sh3-linux- ARCH=sh3 ! RISC ! 32-bit, 16-bit instructions. 16 gp registers, banked ! delayed branches. Also has DSP built-in. ! SH-3E has floating point built in. ! Bottom 8 registers are banked based on processor mode, ! but bank0 is always used in user space ! R0 has special meaning in some addressing modes ! GBR register = for I/O accesses? ! MACH/MACL multiply registers ! PR (Procedure Register) ! PC - Program counter ! can be either big or little endian ! T bit in condition codes set on compare instruction ! S bit used in 64-bit multiply ! addressing modes ! direct register rn ! indirect register @rn ! post-increment indirect @rn+ ! pre-decrement indirect -@rn ! indirect w dispalcement @(disp:4,rn) [4 bit displacemnt] ! indirect indexed @(r0,rn) [has to be r0] ! ind GBR displacement @(disp:8,GBR) [8 bit displacement] ! ind indexed GBR @(R0,GBR) [has to be r0] ! ind PC w displacement @(disp:8,PC) ! PC relative disp [either 8 or 12 bits, or R0] ! immediate #val [tst,and,or,xor=zero extended, ! mov,add,comp=sign extended, ! quadupled for trapa] ! Instructions ! ADD Rm,Rn = Rm+Rn->Rn ! ADDC = add carry (T is carry reg) ADDV = add with overflow ! AND (full or 8-bit zero extend) AND.B (8-bit and w mem at GBR) ! BF (branch if false, no delay slot) BF/S (branch if false) ! BRA (branch always) BRAF (branch far [pc+reg]) ! BSR (branch to subroutine, PC->PR) ! BSRF (branch to subroutine far [pc+reg]) ! BT (branch if true, no delay slot) BT/S (branch if true) ! CLRMAC (clear multiply/accum reg) CLRS (clear S bit) ! CLRT (clear T bit) ! CMP/EQ CMP/GE CMP/GT CMP/HI CMP/HS CMP/PL CMP/PZ CMP/STR ! equal, greater than or equal, greater than ! higher than (ie unsigned gt), higher or same, ! positive, positive or zero, byte in rn=byte in rm ! only immediate compate is with CMP/EQ ! sets T flag ! DIV0S (divide step 0 signed) DIV0U (div step 0 unsigned) ! DIV1 (divide step 1) ! DMULS.L (do 32x32 multiply and store in MACL and MACH) ! DMULU.L (do unsigned 32x32 mult and store in MACL/MACH) ! DT (decrement and test. subtrace one, compare 0, set T) ! EXTS (extend as signed) EXTU (extend as unsigned) ! JMP (jump) JSR (jump subroutine) ! LDS, LDS.L (load to system reg). On plain sh3 only MACH and MACL targets ! MAC.L (multiply and accumulate. two 32-bit vals read from mem and ! multipled, then added to current MAC registers) ! MAC,MAC.W (two 16-bit vals read from mem and multiplied and added to MAC) ! MOV, MOV.B, MOV.W, MOV.L (move) ! MOVA (move effective address) PC+disp -> R0 ! MOVT (T bit stored in REG, zero extended) ! MUL.L (two 32-bit values multiplied, bottom 32bits only stored in MAC) ! MULS.W (two 16-bit signed values multiplied, stored in MAC) ! MULU.W (16-bit unsigned mul) ! NEG (negate), NEGC (negate w carry, used for negating > 32bit num) ! NOP (nop) NOT (1's complement) ! OR, OR.B (logical OR) ! PREF (prefetch data to cache) ! ROTCL (rotate with carry, left) ROTCR (rotate with carry, right) ! ROTL (rotate [shift] left) ROTR (rotate [shift] right) ! RTS (return from subroutine) [PR->PC] ! SETS (set S flag) SETT (set T flag) ! SHAD (Shift arithmatically dynamically) [left and right arith shift] ! SHAL (shift left by one bit, moving T in) ! SHAR (shift right arith by one bit, shifting into T) ! SHLD (shift left dynamically) [meaning amount in reg] ! SHLL (shift left locgically, by one bit) ! SHLL2, SHLL8, SHLL16 (shift left by 2, 8, or 16 bits) ! SHLR (shift right logically, by one bit) ! SHLR2, SHLR8, SHLR16 (shift right by 2,8 ,or 16 bits) ! SUB (subtract) SUBC (subtract with carry) ! SUBV (subtract with overflow) ! SWAP.B SWAP.W (swap bytes) ! TAS (test and set) [for SMP] ! TRAPA (trap always) [system call] ! TST (test) ! XOR, XOR.B (exclusive or) ! XTRCT (extract middle 32 bits fro two registers paired) ! Syscalls ! number in r3, args in r4,r5,r6,r7,r0,r1 ! use the "trapa" instruction. possibly 0x10+num_arguments ! result returned in r0 ! r15 is stack pointer, r14 is frame pointer ! Optimizing ! + 1169 - Original working version ! + 1061 - Remove debugging hooks ! + 1049 - optimize/remove branch delay slots in lzss code ! + 1025 - optimize/remove branch delay slots in sysinfo code ! + 1021 - convert find_string into a loop ! + 1001 - optimize all but num_to_ascii ! + 994 - optimize num_to ascii. Wish I could put the 16 divide ! instructions in a loop somehow, that's 32 bytes there. .include "logo.include" ! offsets into the results returned by the uname syscall .equ U_SYSNAME,0 .equ U_NODENAME,65 .equ U_RELEASE,65*2 .equ U_VERSION,(65*3) .equ U_MACHINE,(65*4) .equ U_DOMAINNAME,65*5 ! offset into the results returned by the sysinfo syscall .equ S_TOTALRAM,16 ! Sycscalls .equ SYSCALL_EXIT, 1 .equ SYSCALL_READ, 3 .equ SYSCALL_WRITE, 4 .equ SYSCALL_OPEN, 5 .equ SYSCALL_CLOSE, 6 .equ SYSCALL_SYSINFO, 116 .equ SYSCALL_UNAME, 122 ! .equ STDIN,0 .equ STDOUT,1 .equ STDERR,2 .globl _start _start: !========================= ! PRINT LOGO !========================= ! LZSS decompression algorithm implementation ! by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989 ! optimized some more by Vince Weaver mov.l @(out_addr-.,pc),r8 ! point r8 to out_buffer mov r8,r10 ! copy for later mov.l @(R-.,pc),r0 ! r is in r0 mov.l @(logo_addr-.,pc),r3 ! r3 points to logo data mov.l @(logo_end_addr-.,pc),r7! r7 points to logo end mov.l @(text_addr-.,pc),r9 ! r9 points to text buf decompression_loop: mov #8,r5 ! shift count mov.b @r3+,r1 ! load a byte, increment pointer test_flags: cmp/ge r7,r3 ! have we reached the end? bt done_logo ! if so, exit ! bt has no delay slot shlr r1 ! shift bottom bit into carry flag bt discrete_char ! if set, we jump to discrete char ! bt has no delay slot offset_length: mov.b @r3+,r12 ! load a byte, increment pointer extu.b r12,r12 mov.b @r3+,r4 ! load a byte, increment pointer ! we can't load halfword as no unaligned loads extu.b r4,r4 shll8 r4 or r4,r12 ! merge back into 16 bits ! this has match_length and match_position ! no need to mask r12, as we do it ! by default in output_loop mov r12,r6 ! r6 will hold the count shlr8 r6 ! assuming P_BITS is 10 shlr2 r6 ! have to shift in 2 parts add #(THRESHOLD+1),r6 ! r6 = (r4 >> P_BITS) + THRESHOLD + 1 ! (=match_length) output_loop: mov.l @(pos_mask-.,pc),r13 ! urgh, can't handle simple constants and r13,r12 ! mask it mov r9,r13 add r12,r13 mov.b @r13,r4 ! load byte from text_buf[r12] add #1,r12 ! increment pointer store_byte: mov.b r4,@r10 ! store a byte add #1,r10 ! increment pointer mov.b r4,@(r0,r9) ! store a byte to text_buf[r] add #1,r0 ! r++ mov.w @(n_val-.,pc),r2 add #-1,r2 ! grrr no way to get this easier and r2,r0 ! mask r dt r6 ! decement count bf output_loop ! repeat until k>j ! bf has no branch delay slot dt r5 ! dt--. If dt=0, T=1 bf test_flags ! if not, re-load flags ! bf has no branch delay slot bra decompression_loop ! loop nop ! wasted branch delay discrete_char: mov.b @r3+,r4 ! load a byte, increment pointer bra store_byte ! and store it (and delay slot) mov #1,r6 ! we set r6 to one so byte ! will be output once ! end of LZSS code done_logo: bsr write_stdout ! print the logo mov r8,r5 ! buffer we are printing to (br delay) !========================== ! PRINT VERSION !========================== first_line: mov r8,r10 ! copy in output pointer mov #SYSCALL_UNAME,r3 ! uname number mov.l @(uname_addr-.,pc),r4 ! uname struct trapa #0x11 ! do syscall bsr strcat_r4 ! os-name from uname "Linux" mov r4,r11 ! save pnter for later (br delay) bsr strcat_r9 ! call strcat mov.l @(data_addr-.,pc),r9 ! source is " Version " (br delay) add #(U_RELEASE/2),r11 ! too big to fit in 8bit offset mov r11,r3 ! save pointer for later add #(U_RELEASE/2),r3 bsr strcat_r4 ! call strcat mov r3,r4 ! version from uname, "2.6.20" (brdly) bsr strcat_r9 ! source is ", Compiled " nop ! delay slot (wasted) add #(U_VERSION/3),r3 ! point to compiled date bsr strcat_r4 ! call strcat mov r3,r4 ! point to compiled data (br delay) bsr strcat_r9 ! source is "\n" nop ! delay slot (wasted) bsr center_and_print ! center and print nop ! branch delay slot (wasted) !=============================== ! Middle-Line !=============================== middle_line: !========= ! Load /proc/cpuinfo into buffer !========= mov r8,r10 ! restore output pointer mov #SYSCALL_OPEN,r3 mov.l @(cpu_addr-.,pc),r4 ! uname struct ! '/proc/cpuinfo' mov #0,r5 ! 0 = O_RDONLY trapa #0x12 ! open() fd returned in r0 mov r0,r1 ! save our fd mov #SYSCALL_READ,r3 mov r1,r4 ! move in fd mov.l @(disk_addr-.,pc),r5 ! load pointer to disk address mov #16,r6 shll8 r6 ! load 4096 (16<<8) ! 4096 is maximum size of proc file ;) trapa #0x13 ! read() mov #SYSCALL_CLOSE,r3 mov r1,r4 ! restore fd trapa #0x11 ! close (to be correct) !============= ! Number of CPUs !============= number_of_cpus: bsr strcat_r9 ! assume no SMP, print "One" nop ! branch delay slot (wasted) !========= ! MHz !========= print_mhz: bsr find_string ! call find_string mov.l @(mhz_search-.,pc),r4 ! look for "MHz" (br delay) bsr strcat_r9 ! print MHz nop ! branch delay slot (wasted) !========= ! Chip Name !========= chip_name: bsr find_string ! call find_string mov.l @(family_search-.,pc),r4 ! find "mily" bsr strcat_r9 ! print " Processor, " nop ! branch delay (wasted) !======== ! RAM !======== mov #SYSCALL_SYSINFO,r3 ! sysinfo() syscall mov.l @(sys_addr-.,pc),r4 ! sysinfo addr trapa #0x11 mov.l @(S_TOTALRAM,r4),r4 ! size in bytes of RAM shlr16 r4 ! divide by 1024*1024 to get M shlr2 r4 ! we shift by 16,2,2 shlr2 r4 ! because sh can't do >>20 direct bsr num_to_ascii ! convert to ASCII mov #1,r3 ! use strcat bsr strcat_r9 ! print 'M RAM, ' nop ! branch delay (wasted) !========= ! Bogomips !========= bsr find_string ! call find_string mov.l @(bogo_search-.,pc),r4 ! look for 'mips' (br delay) bsr strcat_r9 ! print bogomips total nop ! branch delay (wasted) bsr center_and_print ! center and print nop ! branch delay (wasted) !================================= ! Print Host Name !================================= last_line: mov r8,r10 ! restore out_buffer bsr strcat_r4 ! call strcat mov r11,r4 ! restore uname ptr (branch delay) bsr center_and_print ! center and print nop ! branch delay (wasted) bsr write_stdout ! restore colors, print a few linefeeds mov r9,r5 ! point to linefeeds (branch delay) !================================ ! Exit !================================ exit: mov #SYSCALL_EXIT,r3 mov #0,r4 ! result is zero trapa #0x11 ! and exit !======================================= ! Constants used by the logo code ! (have to be within 256 bytes of use) !======================================= .align 2 out_addr: .long out_buffer logo_addr: .long logo logo_end_addr: .long logo_end text_addr: .long text_buf pos_mask: .long ((POSITION_MASK<<8)+0xff) R: .long N-F .align 1 n_val: .word N !================================= ! FIND_STRING !================================= ! r3=char to stop at ! r4=string to find ! r5 trashed find_string: mov.l @(disk_addr-.,pc),r5 ! load pointer to disk buffer find_loop: ! this would be easier with unaligned loads ! but SH doesn't support them ! (this is trickier because qemu-sh4 on x86 does) ! wish we could use ! mov.l @r5,r0 ! turn into a loop? xor r1,r1 ! clear r1 mov #0,r0 ! clear r0 looper: mov.b @(r0,r5),r14 ! load byte shll8 r1 ! shift old part or r14,r1 ! or new part in add #1,r0 ! increment pointer cmp/eq #4,r0 ! have we done 4? bf looper ! if not loop xor r0,r0 ! was our result 0? cmp/eq r1,r0 bt done ! if so, off end, done cmp/eq r4,r1 ! compare to our string bf/s find_loop ! if no match, loop add #1,r5 ! incrememnt pointer (br delay) find_colon: mov.b @r5+,r0 ! load a byte, increment pointer cmp/eq #':',r0 bf/s find_colon ! repeat till we find colon nop ! branch delay slot (wasted) add #1,r5 ! skip the space store_loop: mov.b @r5+,r0 ! load a byte, increment pointer mov.b r0,@r10 ! store a byte add #1,r10 ! increment pointer cmp/eq #'\n',r0 ! is it linefeed? bf/s store_loop ! if not loop nop ! branch delay slot xor r0,r0 ! clear r0 done: rts ! return mov.b r0,@-r10 ! replace last value with NUL (brdly) !================================ ! strcat_r9 !================================ ! value to cat in r9 ! output pointer in r10 ! r0 trashed strcat_r9: mov.b @r9+,r0 ! load a byte, increment pointer mov.b r0,@r10 ! store a byte add #1,r10 ! increment pointer cmp/eq #0,r0 ! is it zero? bf strcat_r9 ! if not loop ! bf has no delay slot rts ! return add #-1,r10 ! point to one less than null (brdlay) !================================ ! strcat_r4 !================================ ! value to cat in r4 ! output pointer in r10 ! r0 trashed strcat_r4: mov.b @r4+,r0 ! load a byte, increment pointer mov.b r0,@r10 ! store a byte add #1,r10 ! increment pointer cmp/eq #0,r0 ! is it zero? bf strcat_r4 ! if not loop ! bf has no branch delay rts ! return add #-1,r10 ! point to one less than null (brdly) !============================== ! center_and_print !============================== ! string to center in at output_buffer center_and_print: sts.l pr,@-r15 ! store return address on stack bsr write_stdout mov.l @(escape_addr-.,pc),r5 ! we want to output ^[[ str_loop2: mov r10,r0 ! point r0 to out pointer sub r8,r0 ! get length by subtracting mov #81,r4 ! we use 81 to not count ending \n subv r0,r4 ! subtract. T=1 on underflow bt done_center ! if result negative, don't center ! bt has no branch delay shlr r4 ! divide by 2 xor r0,r0 addc r0,r4 ! round bsr num_to_ascii ! print number of spaces mov #0,r3 ! print to stdout (branch delay) bsr write_stdout mov.l @(c_addr-.,pc),r5 ! we want to output C (brnch delay) done_center: lds.l @r15+,pr ! restore return address from stack mov r8,r5 ! point to output ! fall through to write_stdout !================================ ! WRITE_STDOUT !================================ ! string to write in r5 ! r0,r1,r2,r3,r4,r6 trashed write_stdout: mov #-1,r0 ! init count mov #0,r2 ! strlen: add #1,r0 ! increment count mov.b @(r0,r5),r1 ! load byte from memory cmp/eq r2,r1 ! compare against zero bf/s strlen ! loop mov r0,r6 ! branch delay slot mov #SYSCALL_WRITE,r3 ! syscall # mov #STDOUT,r4 ! write to stdout trapa #0x13 ! do the syscall rts ! return nop ! branch delay slot (wasted) !============================= ! num_to_ascii !============================= ! r4 = value to print ! r3 = 0=stdout, 1=strcat ! r2,r5,r6 trashed? num_to_ascii: mov.l @(ascii_addr-.,pc),r5 ! point to end of ascii buffer div_by_10: mov #10,r6 ! we'll be dividing by 10 shll16 r6 div0u ! set up for unsigned division mov r4,r2 divloop: div1 r6,r2 ! can't do this in a loop div1 r6,r2 ! because it depends on value of T div1 r6,r2 ! and can't compare for loop end div1 r6,r2 ! without setting T div1 r6,r2 div1 r6,r2 ! could we be clever with rotates div1 r6,r2 ! to save/restore the T book somehow? div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 div1 r6,r2 rotcl r2 extu.w r2,r2 mov #10,r6 ! Q is in r2 mul.l r6,r2 ! multiply Q by 10, store in MACL sts macl,r0 ! move macl to r0 mov r4,r6 sub r0,r6 ! subtract, R is now in r6 add #0x30,r6 ! convert to ascii mov.b r6,@-r5 ! store a byte, decrement pointer mov r2,r4 ! move Q in for next divide xor r0,r0 cmp/eq r0,r4 bf div_by_10 ! if Q not zero, loop ! bf has no branch delay write_out: cmp/eq r3,r0 ! r0 should still be zero bt write_stdout ! if r3==0, goto stdout ! bt has no branch delay bra strcat_r4 ! if 1, strcat mov r5,r4 ! but string in right place (brdelay) .align 2 cpu_addr: .long cpuinfo data_addr: .long data_begin uname_addr: .long uname_info disk_addr: .long disk_buffer sys_addr: .long sysinfo_buff escape_addr: .long escape ascii_addr: .long (ascii_buffer+10) c_addr: .long C family_search: .long 'm'<<24+'i'<<16+'l'<<8+'y' mhz_search: .long 'u'<<24+' '<<16+'c'<<8+'l' bogo_search: .long 'm'<<24+'i'<<16+'p'<<8+'s' .align 1 !=========================================================================== ! section .data !=========================================================================== .data data_begin: ver_string: .ascii " Version \0" compiled_string: .ascii ", Compiled \0" linefeed: .ascii "\n\0" one: .ascii "One \0" mhz: .ascii " \0" processor: .ascii " Processor, \0" ram_comma: .ascii "M RAM, \0" bogo_total: .ascii " Bogomips Total\n\0" default_colors: .ascii "\033[0m\n\n\0" escape: .ascii "\033[\0" C: .ascii "C\0" cpuinfo: .ascii "/proc/cpuinfo\0" .include "logo.lzss_new" .align 2 !============================================================================ ! section .bss !============================================================================ !.bss bss_begin: .comm uname_info,(65*6) .comm sysinfo_buff,(64) .comm ascii_buffer,10 .comm text_buf, (N+F-1) .comm disk_buffer,4096 ! we cheat!!!! .comm out_buffer,16384 ! see /usr/src/linux/include/linux/kernel.h