#
#  linux_logo in crisv32 assembler 0.32
#
#  By Vince Weaver
#
#  assemble with     "as -o ll.o ll.crisv32.s"
#  link with         "ld -o ll ll.o"
#
#  I have to cross-compile, so what I actually do is more like
#     make CROSS=/usr/local/cris/crisv32-axis-linux-gnu/bin/ ARCH=crisv32
#
#  I use qemu for simulating this code (I have no cris hardware)
#    and qemu simulates the newer cris fs, that is, the crisv32
#    instruction set.
#  Qemu gives a mmap error if the "-N" flag is passed to as.
#    This can be worked around by commenting out the
#    "offset & ~TARGET ..." line in mmap.c in qemu/linux-user
#  The older Cris LX has a different instruction set (for example,
#    the PC is r15, among other differences) and might be able
#    to be optimized more.
#
#  It'd be nice to have an Axis 88 development board...
#
#  Little Endian
#  no alignment
#  16-bit instructions (though immediate values can follow)
#  branch delay slots
#  2 operand (generally Rs,Rd)
#  *not* load store?  i.e. can do  add.b [Rs],Rd
#  14 gp 32-bit registers (r0-r13)
#  1 stack pointer (r14 "sp")
#  1 address calc reg (r15 "acr")
#  16 special purpose registers (p0-p15)
#      name    pval  bits  desc
#    + bz    - p0  -  8    zero byte constant
#    + vr    - p1  -  8    version
#    + pid   - p2  - 32    process id
#    + srs   - p3  -  8    support reg select (pick reg bank)
#    + wz    - p4  - 16    word zero constant
#    + exs   - p5  - 32    exception status
#    + eda   - p6  - 32    exception address
#    + mof   - p7  - 32    multiply overflow
#    + dz    - p8  - 32    dword zero constant
#    + ebp   - p9  - 32    exception base
#    + erp   - p10 - 32    exception return
#    + srp   - p11 - 32    subroutine return
#    + nrp   - p12 - 32    nmi return
#    + ccs   - p13 - 32    cond code stack
#    + usp   - p14 - 32    user mode stack
#    + spc   - p15 - 32    single step pc
#  Flags
#
#    Carry oVerflow Zero Negative eXtend
#    Interrupt User (P)SeqBroken RestoreP Singlestep
#    Q(pendingSingle) nMi
#
#    Has room for 3 copies, HW shifts them when
#    exception occurs.
#  Condition codes
#    CC carry clear   !C         CS carry set     C
#    NE not equal     !Z         EQ equal         Z
#    VC not overflow  !V         VS overflow set  V
#    PL plus          !N         MI minus         N
#    LS low or same   C||Z       HI high          !C && !Z
#    GE greater eq    N&&V || !N&&!V
#    LT less than     N&&!V || !N&&V
#    LE less equal    Z||N&&!V||!N&&V
#    A  always
#    SB Sequence Broken  P
#  N,Z set on all
#  N,Z,V,C set on ADD,ADDQ,ADDS,ADDU,ADDC
#  N,Z,V,C on CMP,CMPQ,CMPS,CMPU,NEG,SUB,SUBQ,SUBS,SUBU
#  N,Z,V on MULS,MULU
#  N,Z on BTST,BTSTQ
#  P set on restore from exception, cache miss cond write
#  C set if cond write fails
#  Addressing Modes
#    Quick Immediate - 6 bit, extended
#    Register        - register
#    Indirect        - reg pointer [R5]
#    Autoincrement   - [R5+]
#                      (cannot [ACR+] or [R15+])
#    Immediate       - follows immediately after insn
#  branches can have 8 or 16 bit offsets
#  no jumps or instructions with long immediates in delay slot
#  bas - branch and save.
#        takes 32 bit value, followed by delay slot
#  jasc/basc - gives you 4 unused bytes you can
#        use yourself before delay slot
#  address calc instructions (addo) often used with acr
#        don't affect flags
#  lapc - relative address based on PC
#  the "ax" insn enables extended arithmetic:
#        it makes the next instruction take into account
#        the C and other flags, as if it were one big arith insn
#  instruction summary
#    ABS   - absolute value
#    ADD.s - add (size = b,w,d), ADDC, ADDQ
#    ADDI  - add index (scaled add)
#    ADDO  - add offset (dest is ACR), ADDOQ (quick - 8bit imm)
#    ADDS  - add with sign extend, ADDU zero extend
#    AX    - enable extended arithmetic
#    CMP, CMPQ, CMPS (sign extend), CMPU
#    DSTEP - divide step
#    LAPC  - load PC relative, LAPCQ
#    MCP   - multiply carry propagation
#    MULS  - signed multiply, MULU
#    NEG, NOT
#    SUB, SUBQ, SUBS, SUBU
#  logical
#    AND, ANDQ
#    BTST  - bit test (N set to bit Rs, Z = if bits to right 0)
#    BTSTQ
#    OR, ORQ
#    TEST  - compare with zero
#    XOR
#  shift
#    ASR,ASRQ, LSL, LSLQ, LSR, LSRQ
#  branch
#    BA
#    BAS   - branch and save, BASC (with context), BSR, BSRC
#    Bcc   - conditionally
#    JAS, JASC, JSR,
#    JSRC, JUMP
#    RET
#  misc
#    BOUND  - limit index to a maximum
#    CLEAR  - set register to zero
#    CLEARF - clear flags
#    LZ     - count leading zeros
#    MOVE   - move register, MOVEQ, MOVS, MOVU
#    MOVEM  - move memory into multiple registers
#    NOP
#    SCC    - set conditional
#    SETF   - set flags
#    SWAPs  - Swap bits (inverting, reversing or swapping)
#             N (invert) W (swap words) B (swap bytes) R (reverse)
#  Calling convention
#    parms passed in R10-R13, return in R10
#    r0-r8 callee saved, r9-r15 caller saved
#  differences between FS and LX
#    - pc is not r15
#    - BDAP, BIAP, DIP prefixes removed (addr calc)
#    - MSTEP removed (MUL instructions added)
#    - MOVE to PC removed
#    - BOUND with memory operand removed
#    - MOVEM order changed
#    - NOP is not SETF instead of ADDI R0.b,R15
#    - RET is not JUMP Ps
#  Optimization (the assembler pads to 32 byte boundary)
#    + 1267 bytes = Initial direct port of avr32 code
#    + 1107 bytes = Reduce constants to 16 bit when possible
#    + 1043 bytes = Fill branch delay slots
#    + 1011 bytes = loop the divide routine
#    +  979 bytes = subroutine jsr to reg instead of bsr for strcat
#    +  979 bytes = use "addi acr" instruction
#    +  947 bytes = use r8 for write_stdout jsr instead of bsr
#    +  947 bytes = remove all bsr instructions
#    +  947 bytes = consistently use r3 instead of loading out_buffer
#    +  915 bytes = miscellaneous fixes, enough to push under the barrier
#    +  913 bytes = remove unused variable
#    +  911 bytes = merge bogomips and linefeed strings
#    +  905 bytes = remove .section bss and .section data directives

.include "logo.include"

# Byte offsets of the fields inside the buffer filled in by the
# uname syscall (each field is 65 bytes wide).
.equ U_SYSNAME,0
.equ U_NODENAME,65
.equ U_RELEASE,65*2
.equ U_VERSION,(65*3)
.equ U_MACHINE,(65*4)
.equ U_DOMAINNAME,65*5

# Byte offset of the total-RAM field inside the buffer filled in by
# the sysinfo syscall.
.equ S_TOTALRAM,16

# Syscall numbers (loaded into r9 before "break 13")
.equ SYSCALL_EXIT,     1
.equ SYSCALL_READ,     3
.equ SYSCALL_WRITE,    4
.equ SYSCALL_OPEN,     5
.equ SYSCALL_CLOSE,    6
.equ SYSCALL_SYSINFO,  116
.equ SYSCALL_UNAME,    122
#
.equ STDIN,0
.equ STDOUT,1
.equ STDERR,2

	.globl	_start
_start:

	;=========================
	; PRINT LOGO
	;=========================

; LZSS decompression algorithm implementation
; by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
; optimized some more by Vince Weaver
;
; Register roles while decompressing:
;   r1  = write pointer into out_buffer
;   r2  = R, the write index into text_buf (starts at N-F)
;   r3  = read pointer into the compressed logo
;   r4  = logo_end (end of the compressed data)
;   r8  = current flag byte; bits 8-15 are pre-set and act as a counter
;   r9  = the byte currently being emitted
;   r10 = number of bytes still to emit for the current item
;   r11 = read index into text_buf for a match
;   r12 = base address of text_buf
; Lines tagged [DS] sit in the delay slot of the branch above them.

	movu.w	out_buffer,r1		; r1 = where decompressed bytes go
	movu.w	(N-F),r2		; r2 = R = N-F
	movu.w	logo,r3			; r3 = start of compressed logo
	movu.w	logo_end,r4		; r4 = end of compressed logo
	movu.w	text_buf,r12		; r12 = base of text_buf

decompression_loop:
	movu.b	[r3+],r8		; fetch a flag byte, advance input
	or.w	0xff00,r8		; set bits 8-15: hackish 8-step counter

test_flags:
	cmp.d	r3,r4			; consumed all of the input?
	beq	done_logo		; yes, go print the result
	btstq	0,r8			; [DS] N = flag bit 0
	bmi	discrete_char		; bit set: next input byte is a literal
	lsrq	1,r8			; [DS] discard the flag bit just tested

offset_length:
	movu.w	[r3+],r10		; fetch a halfword (offset/length pair)
	move.d	r10,r11			; r11 = copy of r10
					; r11 is not masked here because
					; output_loop masks it every pass
	lsrq	P_BITS,r10
	addq	(THRESHOLD+1),r10	; r10 = (pair >> P_BITS) + THRESHOLD + 1
					;     = match_length

output_loop:
	and.w	((POSITION_MASK<<8)+0xff),r11	; keep only the position bits
	addi	r12.b,r11,acr		; acr = text_buf + r11
	movu.b	[acr],r9		; r9 = text_buf[r11]
	addq	1,r11			; step the text_buf read index

store_byte:
	move.b	r9,[r1+]		; emit byte to out_buffer, advance
	addi	r12.b,r2,acr		; acr = text_buf + R
	move.b	r9,[acr]		; text_buf[R] = byte
	addq	1,r2			; R++
	subq	1,r10			; one byte fewer to emit
	bne	output_loop		; keep copying until the count hits zero
	and.w	(N-1),r2		; [DS] mask R with N-1

	btstq	8,r8			; N = bit 8 of r8; it stays set until
					; all eight flag bits are shifted out
					; (one less instruction and one less
					; temp reg than lz/cmpq)
	bmi	test_flags		; still set: more flags in this byte
	nop				; [DS]
	ba	decompression_loop	; clear: load a fresh flag byte
	nop				; [DS]

discrete_char:
	movu.b	[r3+],r9		; fetch the literal byte, advance input
	ba	store_byte		; emit it through the shared store path
	moveq	1,r10			; [DS] exactly one byte to emit

; end of LZSS code

done_logo:
	movu.w	write_stdout,r8		; r8 = address of write_stdout (for jsr)
	movu.w	out_buffer,r11		; r11 = string to print
	jsr	r8			; call write_stdout (print the logo)
	move.d	r11,r3			; [DS] keep out_buffer address in r3
movu.w strcat,r13 ; make r13 hold strcat for jsr ; instead of bsr movu.w center_and_print,r5 movu.w num_to_ascii,r2 movu.w find_string,r7 ;========================== ; PRINT VERSION ;========================== first_line: movu.w uname_info,r10 ; uname struct move.d r10,r4 ; copy for later movu.b SYSCALL_UNAME,r9 break 13 ; do syscall ; os-name from uname "Linux" ; already at r10 jsr r13 ; call strcat movu.w r3,r1 ; [DS]point to out buffer ; r4 already points to Linux jsr r13 ; call strcat movu.w ver_string,r4 ; [DS] source is " Version " jsr r13 ; call strcat movu.w ((uname_info)+U_RELEASE),r4 ; [DS] version from uname, ie "2.6.20" ; can't use addoq, not enough ; bits (just barely) jsr r13 ; call strcat movu.w compiled_string,r4 ; [DS] source is ", Compiled " jsr r13 ; call strcat movu.w ((uname_info)+U_VERSION),r4 ; [DS] compiled date jsr r13 ; call strcat movu.w linefeed,r4 ; [DS] print a linefeed jsr r5 ; call center_and_print nop ;=============================== ; Middle-Line ;=============================== middle_line: ;========= ; Load /proc/cpuinfo into buffer ;========= move.d r3,r1 ; point to out_buffer movu.w cpuinfo,r10 ; '/proc/cpuinfo' clear.d r11 ; 0 = O_RDONLY moveq SYSCALL_OPEN,r9 break 13 ; syscall. return in r10 move.d r10,r6 ; save our fd movu.w disk_buffer,r11 movu.w 4096,r12 ; 4096 is maximum size of proc file ;) moveq SYSCALL_READ,r9 break 13 move.d r6,r10 ; restore fd moveq SYSCALL_CLOSE,r9 break 13 ; close (to be correct) ;============= ; Number of CPUs ;============= number_of_cpus: jsr r13 ; call strcat movu.w one,r4 ; [DS] cheat. 
assuming no smp cris machines ;========= ; MHz ;========= print_mhz: ; /proc/cpuinfo on crisv32 does not report megahertz ;========= ; Chip Name ;========= chip_name: jsr r7 ; call find_string move.d ('l'<<24)+('e'<<16)+('d'<<8)+'o',r10 ; [DS] look for "cpu model" jsr r13 ; call strcat movu.w processor,r4 ; [DS] print " Processor, " ;======== ; RAM ;======== ram: movu.w sysinfo_buff,r10 movu.b SYSCALL_SYSINFO,r9 break 13 ; sysinfo() syscall movu.w sysinfo_buff+S_TOTALRAM,acr ; size in bytes of RAM move.d [acr],r10 lsrq 20,r10 ; divide by 1024*1024 to get M jsr r2 ; call num_to_ascii moveq 1,r12 ; [DS] use strcat jsr r13 ; call strcat movu.w ram_comma,r4 ; [DS] print 'M RAM, ' ;======== ; Bogomips ;======== bogomips: jsr r7 ; call find_string move.d ('s'<<24)+('p'<<16)+('i'<<8)+'m',r10 ; [DS] find 'mips' and grab up to '\n' jsr r13 ; call strcat movu.w bogo_total,r4 ; [DS] print bogomips total jsr r5 ; call center_and_print nop ;================================= ; Print Host Name ;================================= last_line: move.d r3,r1 ; point r1 to out_buffer jsr r13 ; call strcat movu.w ((uname_info)+U_NODENAME),r4 ; [DS] host name from uname() jsr r5 ; call center_and_print nop jsr r8 ; call write_stdout movu.w default_colors,r11 ; restore colors, print a few linefeeds ;================================ ; Exit ;================================ exit: moveq SYSCALL_EXIT,r9 ; load syscall value clear.d r10 ; return a 0 break 13 ; exit ;================================= ; FIND_STRING ;================================= ; r10 = string to find find_string: movu.w disk_buffer,r6 ; look in cpuinfo buffer find_loop: move.d [r6],r0 ; load unaligned word beq done ; if zero, then not found addq 1,r6 ; [DS] increment pointer cmp.d r0,r10 bne find_loop ; loop until we find our string nop ; [DS] addq 3,r6 ; skip what we just searched skip_tabs: movu.b [r6+],r0 ; read in a byte cmp.b ' '+1,r0 ; are we whitespace ; (grrr, one bit too big for cmpq) blt skip_tabs ; if so, loop nop 
; [DS] addq 1,r6 ; adjust pointer (skip colon) store_loop: movu.b [r6+],r0 ; load a byte, increment pointer cmpq '\n',r0 beq almost_done ; nop ; [DS] move.b r0,[r1+] ; store a byte, increment pointer ba store_loop ; almost_done: move bz,[r1] ; [DS] replace last value with NUL done: ret ; return nop ; [DS] ;============================== ; center_and_print ;============================== ; string to center in out_buffer center_and_print: move srp,r15 ; save link register jsr r8 ; call write_stdout movu.w escape,r11 ; [DS] we want to output ^[[ move.d r1,r10 ; save end of buffer move.d r3,r11 ; point r11 to out_buffer sub.d r3,r10 ; subtract neg.d r10,r10 adds.b 81,r10 ; (buf_begin-buf_end)+80 bmi done_center ; if result negative, don't center lsrq 1,r10 ; [DS] divide by 2 jsr r2 ; call num_to_ascii (print number of spaces) clear.d r12 ; [DS] print to stdout jsr r8 ; call write_stdout movu.w C,r11 ; [DS] we want to output C move.d r3,r11 ; point to out_buffer done_center: move r15,srp ; restore link register ;================================ ; WRITE_STDOUT ;================================ ; r11 has string ; r9,r10,r12 trashed write_stdout: move.d r11,r12 ; copy string pointer to r12 str_loop1: test.b [r12+] ; test if zero, auto-increment bne str_loop1 ; branch if not zero nop ; [DS] branch delay slot sub.d r11,r12 ; get count into r12 subq 1,r12 ; adjust for overcount write_stdout_we_know_size: moveq SYSCALL_WRITE,r9 ; load the write syscall moveq STDOUT,r10 ; print to stdout break 13 ; actually run syscall ret ; return ;============================= ; num_to_ascii ;============================= ; r10 = value to print ; r12 = 0=stdout, 1=strcat num_to_ascii: movu.w (ascii_buffer+9),r11 ; [DS] point to end of our buffer div_by_10: ; 16 bit div code orig from crisv32 manual moveq 10,r6 ; divide by 10 lslq 16,r6 subq 1,r6 ; Subtract one from the denominator. 
moveq 16,r0 div_loop: subq 1,r0 bne div_loop dstep r6,r10 ; [DS] Perform 16 interations ; q is in lower 16 bits ; r is in upper 16 bits move.d r10,r0 lsrq 16,r0 addq 48,r0 ; convert to ASCII subq 1,r11 ; decrement pointer move.b r0,[r11] ; store a byte movu.w r10,r10 ; mask off remainder bne div_by_10 ; if Q not zero, loop write_out: cmpq 0,r12 ; [DS] bne strcat move.d r11,r4 ; [DS] move for strcat ; (harmless for write_stdout) ascii_stdout: ba write_stdout ; else, branch to stdout ;================================ ; strcat (used to preserve linear string pointer) ;================================ ; value to cat in r4 ; output buffer in r1 ; r0 trashed strcat: movu.b [r4+],r6 ; [DS] load a byte, increment pointer bne strcat ; loop if not zero move.b r6,[r1+] ; [DS] store a byte, increment pointer ret ; return subq 1,r1 ; [DS] point to one less than null ;=========================================================================== ; section .data ;=========================================================================== ;.data ver_string: .ascii " Version \0" compiled_string: .ascii ", Compiled \0" one: .ascii "One CRIS \0" processor: .ascii " Processor, \0" ram_comma: .ascii "M RAM, \0" bogo_total: .ascii " Bogomips Total" linefeed: .ascii "\n\0" default_colors: .ascii "\033[0m\n\n\0" escape: .ascii "\033[\0" C: .ascii "C\0" .ifdef FAKE_PROC cpuinfo: .ascii "proc/cpu.cris\0" .else cpuinfo: .ascii "/proc/cpuinfo\0" .endif .include "logo.lzss_new" ;============================================================================ ; section .bss ;============================================================================ ;.section bss .lcomm uname_info,(65*6) .lcomm sysinfo_buff,(64) .lcomm text_buf, (N+F-1) .lcomm ascii_buffer,10 .lcomm disk_buffer,4096 ; we cheat!!!! .lcomm out_buffer,16384 ; see /usr/src/linux/include/linux/kernel.h