# # linux_logo in m68k assembler 0.25 # # Originally by # Vince Weaver # # Crazy size-optimization hacks by # Stephan Walter # # assemble with "as -o ll.o ll.m68k.s" # link with "ld -o ll ll.o" | From the m68k programming manual: | 16 general purpose 32-bit registers, d7-d0, a7-a0 | 32-bit program counter, 8-bit condition code register | stack pointer is a7, frame pointer is a6 | condition codes are Carry, oVerflow, Zero, Negative, eXtend | address registers cannot be used for byte-sized operations | bigendian architecture | instructions are variable length, from 1 to 11 16-bit words | From google searches: | Syscalls: | trap #0 instruction. Syscall num on d0, arguments in d1-d? | gleaned from the as info file | comment character is | (need --bitwise-or if need to use it for or) | m68k assemly developed at MIT? compatible with Sun assembler | %a0-%a7 = gp regs. %pc , %zpc (zero address regard to pc) | %za0-%za7 (suppressed address reg?) | Crazy number of addressing modes (18): | Immediate: $NUM | Data reg direct: %d0-%d7 | Address reg direct: %a0-%a7 %a7=stack pointer, %a6=frame pointer | Addr reg indirect: %a0@ through %a7@ | Addr reg postincrement: %a0@+ - %a7@+ | Addr reg postdecrement: %a0@- - %a7@- | Indirect plus offset: %a0@(NUMBER) | Index: %a0@(NUMBER,REGISTER:SIZE:SCALE) | Postindex: %a0(NUMBER)@(ONUMBER,REGISTER:SIZE:SCALE) | Preindex: APC@(NUMBER,REGISTER:SIZE:SCALE)@(ONUMBER) | Absolute: SYMBOL or DIGITS? | | There is an alternate morotolla syntax that gas can also handle | exg = exchange, lea=load effecive address, | link= update stack frame | move | N and Z updated. | can move to and from the CCR register | move16 = moves 16-byte block | movem stores a list of registers to memory | movep = output to 8-bit peripheral | moveq = move quick (load an 8-bit value) | pea = push immediate on stack | unlk reverse of link | add (works on d regs), adda (works on a regs) | addi = immediate, addq = immediate value of 1-8 | addx = add two regs plus the X condition code | clr (set to zero) cmp,cmpi,cmpa (compare), cmpm (compare memory) | cmp2 (compare if in range?) like chk but sets CCR, doesn't trap | divs/divu | chk = check register against bound, trap. chk2 compare between two bounds | ext,extb sign extend | muls,mulu - multiply (signed/unsigned) | neg=negate,negx=substrat 0-dest-X CCR | sub,suba,subi,subq,subx | and,andi,eor,eori,not,or,ori | andi,eori,ori can have CCR as destination | asl,asr=arithmatic shift | shifted out values goes to both C and X | lsl,lsr=logical shift | shifted out goes to C and X, 0 is shifted in. shift value can be imm or r | rol,ror = rotate. the rotated off bit also goes into C | roxl roxr rotate through X | swap = swap high and low words | bchg,bclr,bset,btst = bit mabipulation options | bchg tests a bit and sets the Z flag, then inverts the bit?! | bcl rests a bit and sets the Z flag, then clears the bit | bset tests a bit and sets the z flag and sets the bit | btst just tests the bit and sets Z | bfchg,bfclr,bfexts,bfextu,bfffo,bfins,bfset,bfst = bitfield manipulation | bfchg is like bchg, just N is also set if the high bit is set | bfexts exracts bit field and sign extends (bfextu is same, zero extended) | bfffo finds first one in a bit field | bfins inserts a bit field | bfset set N and C based on bitfield of ea, and sets the bits | bftst like above but no setting | abcd,nbcd,pack,sbcd,unpk = bcd instructions | CC = one of CC,CS (carry clear/set) EQ NE GE GT LE LT (comparison) | HI,LS (low), MI (minus), PL (plus) VC, VS (overflow clear/set) | bCC = branch | dbCC = check condition. If true, nothing. If not-true, | decrement counter and brach (a loop instruction) | ie, DBMI=decrememnt and loop until minus | in addition to normal flags, also F and T (false and true) | sCC = if condition code than 1s to result, else 0s to result | bra=branch always, bsr=branch subroutine | bsr pushes return value on stack | jmp,jsr=push return address on stack | nop = also forces all pending bus transactions to complete | rtd = returns (pulls pc off stack) and adds displacement to stack | rtr = return and pull CCR from stack | rts = just pulls pc from stack | assembly has two operations, ie add source,destination | many others for fp, system mode (cache invalidation,mmu,smp,etc) | Optimization | + 1078 - First working version | + 1062 - Make all strcat calls use %a5 | + 1038 - Make out_buffer be stored in %a6 | + 1038 - Use the "dbf" instruction in a loop (saves 2 words) | + 1030 - (note, only goes down 4 bytes at a time) | Make sure to use addq instead of addi, don't recopy the uname pointer | + 1014 - Change the printing code to use linear %a1 instead of re-loading | each time .include "logo.include" | offsets into the results returned by the uname syscall .equ U_SYSNAME,0 .equ U_NODENAME,65 .equ U_RELEASE,65*2 .equ U_VERSION,(65*3) .equ U_MACHINE,(65*4) .equ U_DOMAINNAME,65*5 | offset into the results returned by the sysinfo syscall .equ S_TOTALRAM,16 | Sycscalls .equ SYSCALL_EXIT, 1 .equ SYSCALL_READ, 3 .equ SYSCALL_WRITE, 4 .equ SYSCALL_OPEN, 5 .equ SYSCALL_CLOSE, 6 .equ SYSCALL_SYSINFO, 116 .equ SYSCALL_UNAME, 122 | .equ STDIN,0 .equ STDOUT,1 .equ STDERR,2 .globl _start _start: |========================= | PRINT LOGO |========================= | LZSS decompression algorithm implementation | by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989 | optimized some more by Vince Weaver move.l #out_buffer,%a6 | buffer we are printing to move.l %a6,%a1 move.l #(N-F),%d2 | R move.l #(logo),%a3 | a3 points to logo data move.l #(logo_end),%a4 | a4 points to logo end move.l #text_buf,%a5 | r5 points to text buf decompression_loop: clr.l %d5 | clear the %d5 register move.b %a3@+,%d5 | load a byte, increment pointer or.w #0xff00,%d5 | load top as a hackish 8-bit counter test_flags: cmp.l %a4,%a3 | have we reached the end? bge done_logo | if so, exit lsr #1,%d5 | shift bottom bit into carry flag bcs discrete_char | if set, we jump to discrete char offset_length: clr.l %d4 move.b %a3@+,%d0 | load 16-bits, increment pointer move.b %a3@+,%d4 | do it in 2 steps because our data is little-endian :( lsl.l #8,%d4 move.b %d0,%d4 move.l %d4,%d6 | copy d4 to d6 | no need to mask d6, as we do it | by default in output_loop moveq.l #P_BITS,%d0 lsr.l %d0,%d4 move.l #(THRESHOLD+1),%d0 add.l %d0,%d4 add %d4,%d1 | d1 = (d4 >> P_BITS) + THRESHOLD + 1 | (=match_length) output_loop: andi #((POSITION_MASK<<8)+0xff),%d6 | mask it move.b %a5@(0,%d6),%d4 | load byte from text_buf[] addq #1,%d6 | advance pointer in text_buf store_byte: move.b %d4,%a1@+ | store a byte, increment pointer move.b %d4,%a5@(0,%d2) | store a byte to text_buf[r] add #1,%d2 | r++ andi #(N-1),%d2 | mask r dbf %d1,output_loop | decrement count and loop | if %d1 is zero or above bftst %d5,16:8 | are the top bits 0? bne test_flags | if not, re-load flags jmp decompression_loop discrete_char: move.b %a3@+,%d4 | load a byte, increment pointer clr.l %d1 | we set d1 to zero which on m68k | means do the loop once jmp store_byte | and store it | end of LZSS code done_logo: move.l %a6,%a3 | out_buffer we are printing to bsr write_stdout | print the logo optimizations: | Optimization setup move.l #strcat,%a5 | load strcat address into %a5 |========================== | PRINT VERSION |========================== first_line: move.l #uname_info,%d1 | uname struct moveq.l #SYSCALL_UNAME,%d0 trap #0 | do syscall move.l %d1,%a1 | os-name from uname "Linux" move.l %a6,%a2 | point %a2 to out_buffer jsr (%a5) | call strcat move.l #ver_string,%a1 | source is " Version " jsr (%a5) | call strcat move.l %a1,%a4 move.l #((uname_info)+U_RELEASE),%a1 | version from uname, ie "2.6.20" jsr (%a5) | call strcat move.l %a4,%a1 | source is ", Compiled " jsr (%a5) | call strcat move.l %a1,%a4 move.l #((uname_info)+U_VERSION),%a1 | compiled date jsr (%a5) | call strcat move.l %a4,%a1 move.b #0xa,%a2@+ | store a linefeed, increment pointer move.b #0,%a2@+ | NUL terminate, increment pointer bsr center_and_print | center and print |=============================== | Middle-Line |=============================== middle_line: |========= | Load /proc/cpuinfo into buffer |========= move.l %a6,%a2 | point a2 to out_buffer move.l #(cpuinfo),%d1 | '/proc/cpuinfo' movq.l #0,%d2 | 0 = O_RDONLY movq.l #SYSCALL_OPEN,%d0 trap #0 | syscall. return in d0? move.l %d0,%d5 | save our fd move.l %d0,%d1 | move fd to right place move.l #disk_buffer,%d2 move.l #4096,%d3 | 4096 is maximum size of proc file ;) move.l #SYSCALL_READ,%d0 trap #0 move.l %d5,%d1 move.l #SYSCALL_CLOSE,%d0 trap #0 | close (to be correct) |============= | Number of CPUs |============= number_of_cpus: | cheat. Who has an SMP arm? jsr (%a5) |========= | MHz |========= print_mhz: move.l #(('i'<<24)+('n'<<16)+('g'<<8)+':'),%d0 bsr find_string | find 'ing:' and grab up to '\n' move.b #' ',%a2@+ | put in a space |========= | Chip Name |========= chip_name: move.l #(('C'<<24)+('P'<<16)+('U'<<8)+':'),%d0 bsr find_string | find 'CPU:' and grab up to '\n' | print " Processor, " jsr (%a5) |======== | RAM |======== move.l #(sysinfo_buff),%d1 move.l %d1,%a0 | copy moveq.l #SYSCALL_SYSINFO,%d0 trap #0 | sysinfo() syscall move.l %a0@(S_TOTALRAM),%d1 | size in bytes of RAM moveq.l #20,%d3 lsr.l %d3,%d1 | divide by 1024*1024 to get M | adc r3,r3,#0 | round moveq.l #1,%d0 move.l %a1,%a4 bsr num_to_ascii move.l %a4,%a1 | print 'M RAM, ' jsr (%a5) | call strcat |======== | Bogomips |======== move.l #(('i'<<24)+('p'<<16)+('s'<<8)+':'),%d0 bsr find_string | find 'ips:' and grab up to '\n' jsr (%a5) | print bogomips total bsr center_and_print | center and print |================================= | Print Host Name |================================= last_line: move.l %a6,%a2 | point a2 to out_buffer move.l #((uname_info)+U_NODENAME),%a1 | host name from uname() jsr (%a5) | call strcat bsr center_and_print | center and print move.l #(default_colors),%a3 | restore colors, print a few linefeeds bsr write_stdout |================================ | Exit |================================ exit: moveq.l #0,%d1 | return a 0 moveq.l #SYSCALL_EXIT,%d0 trap #0 | and exit |================================= | FIND_STRING |================================= | %d0 = string to find find_string: move.l #(disk_buffer-1),%a3 | look in cpuinfo buffer find_loop: lea %a3@(1),%a3 | add one to pointer move.l %a3@,%d1 | load an unaligned word beq done | if off the end, finish cmp.l %d1,%d0 bne find_loop lea %a3@(4),%a3 | skip what we just searched skip_tabs: move.b %a3@+,%d3 | read in a byte cmp.b #'\t',%d3 | are we a tab? beq skip_tabs | if so, loop lea %a3@(-1),%a3 | adjust pointer store_loop: move.b %a3@+,%d3 | load a byte, increment pointer move.b %d3,%a2@+ | store a byte, increment pointer cmp.b #'\n',%d3 bne store_loop almost_done: move.l #0,%d7 move.b %d7,%a2@- | replace last value with NUL done: rts | return |================================ | strcat |================================ | value to cat in a1 | output buffer in a2 | d3 trashed strcat: move.b %a1@+,%d3 | load a byte, increment pointer move.b %d3,%a2@+ | store a byte, increment pointer bne strcat | loop if not zero sub #1,%a2 | point to one less than null rts | return |============================== | center_and_print |============================== | string to center in at output_buffer center_and_print: move.l #(escape),%a3 | we want to output ^[[ bsr write_stdout move.l %a6,%a3 | point %a3 to out_buffer suba.l %a3,%a2 | get length by subtracting | a2 = a2-a3 move.l %a2,%d3 move.l #81,%d1 sub.l %d3,%d1 | subtract! d1=d1-d3 | we use 81 to not count ending \n bmi done_center | if result negative, don't center lsr #1,%d1 | divide by 2 | addx.l %d1,#0 | round? move.l #0,%d0 | print to stdout bsr num_to_ascii | print number of spaces move.l #(C),%a3 | we want to output C bsr write_stdout move.l %a6,%a3 done_center: |================================ | WRITE_STDOUT |================================ | a3 has string | d0,d1,d2,d3 trashed write_stdout: moveq.l #0,%d3 | clear count str_loop1: add #1,%d3 move.b %a3@(0,%d3),%d2 bne str_loop1 | repeat till zero write_stdout_we_know_size: move.l %a3,%d2 moveq.l #STDOUT,%d1 | print to stdout moveq.l #SYSCALL_WRITE,%d0 | load the write syscall trap #0 | actually run syscall rts | return |############################# | num_to_ascii |############################# | d1 = value to print | d0 = 0=stdout, 1=strcat num_to_ascii: move.l #(ascii_buffer+10),%a3 | point to end of our buffer div_by_10: divu.w #10,%d1 | divide by 10. Q in lower, R in upper bfextu %d1,0:16,%d2 | copy remainder to %d2 bfextu %d1,16:16,%d1 | mask out quotient into %d1 | bl divide @ Q=r7,$0, R=r8,$1 add #0x30,%d2 | convert to ascii move.b %d2,%a3@- | store a byte, decrement pointer cmp #0,%d1 | bne div_by_10 | if Q not zero, loop write_out: cmp #0,%d0 beq ascii_stdout move.l %a3,%a1 jmp (%a5) | if 1, strcat ascii_stdout: jmp write_stdout | else, fallthrough to stdout #=========================================================================== # section .data #=========================================================================== .data data_begin: ver_string: .ascii " Version \0" compiled_string: .ascii ", Compiled \0" one: .ascii "One \0" processor: .ascii " Processor, \0" ram_comma: .ascii "M RAM, \0" bogo_total: .ascii " Bogomips Total\n\0" default_colors: .ascii "\033[0m\n\n\0" escape: .ascii "\033[\0" C: .ascii "C\0" cpuinfo: .ascii "/proc/cpuinfo\0" .include "logo.lzss_new" #============================================================================ # section .bss #============================================================================ .bss bss_begin: .lcomm uname_info,(65*6) .lcomm sysinfo_buff,(64) .lcomm ascii_buffer,10 .lcomm text_buf, (N+F-1) .lcomm disk_buffer,4096 | we cheat!!!! .lcomm out_buffer,16384 # see /usr/src/linux/include/linux/kernel.h