//
//  linux_logo in ia64 assembler    0.24
//
//  by Vince Weaver
//
//  assemble with     "as -o ll.o ll.ia64.s"
//  link with         "ld -o ll ll.o"

.include "logo.include"

// ASSEMBLY INFO
// The comment marker is the C++ comment character (two slashes)
// by default the assembler is in automatic mode, which bundles
//    instructions for you
// you can enable explicit mode with ".explicit"
// you specify the end of an instruction group with ;;
// an instruction group is a set of instructions with no dependencies
// a bundle has 3 instructions, each 41 bits long, plus a 5-bit field
// you can indicate a bundle by hand using { } brackets to include
//    with a .specifier to indicate what insn type (For example .mii)
// not all instructions can be bundled together.
// Long immediate and jump instructions can take 2 instruction slots
// all instructions in an instruction group run at the same time.
// reg RAW and WAW dependencies are not allowed.  WAR are allowed (usually)
// mem RAW and WAW dependencies are allowed.  accesses happen in prog order
// exceptions: RAW - subsequent branches can see changes made to pr regs
//             a few other exceptions, not really relevant to ll
// instructions can be predicated, written like   (p1) mov r1=r4
// compare instructions can set predicates
// Instructions come in bundles.  Up to 3 instructions per bundle
// can jump to x86 code with "jmpe"
// can do "virtual register allocation" to give registers symbolic names
// can annotate branches with guessed targets and probabilities
// 128 int / 128 fp registers, 64 predicate registers, 8 branch regs
// register stack
// register rotation
// 128 int registers, GR0-GR127.  in x86 mode you get GR8-GR31
// registers are 64-bit plus an extra "NaT" (Not a Thing) bit
// GR0-GR31 always visible.
// GR0 is a zero register.
//    (GR0) Always reads 0, writes cause a fault
// r1 = gp (global pointer)
// r2-r3 = scratch (usable with 22-bit add)
// r4-r7 are saved registers
// r8 = struct/union pointer
// r8-r11 = return values
// r12 = stack pointer (must be 128-bit aligned)
// r13 = thread pointer
// GR32-127 are stacked registers.
// 128 fp registers, FR0-FR127
// FR0-FR31 always visible.
// FR0 always reads as 0.0, FR1 always reads as 1.0.  can't write them
// 64 predicate registers, PR0-PR63 (1 bit each)
// PR0-PR15 always available.  PR0 is always 1
// PR16-PR63 are rotating (for loop unrolling)
// 8 branch registers, BR0-BR7 (br0 also known as rp)
// 8 Kernel registers KR0-7 (AR0-7).  Can be read in userspace but not written
// LC (AR65) is a loop counting register
// Many other registers that don't matter to us in ll
// normal looking add instruction.  Extra add/sub instructions that
//    can additionally add/sub a "1" as well
// There is a special add (addl) with a 22-bit immediate whose register
//    operand can only be GR0-GR3
// shladd can also shift left by 1-4 bits before the add
// integer multiply uses the FP unit (xma)
// special 32-bit pointer addition instruction addp
// various shift and extract instructions.  Funnel shift?
// movl can load a 64-bit immediate, taking two instruction slots
// cmp, cmp4 (32-bit), tbit, tnat, etc do compares and set two predicate bits
// normally both bits are set: one gets the result, the other its complement
// also "unc" unconditional: like normal but inits both to zero if not being set
// also and, or, demorgan forms which allow multiple compares to be combined?
// cmp.ne p1,p2=a,r0  means p1 = (a!=0), p2 = (a==0)
// loads/stores.
// Can be either little/big-endian, but set at program start time
// can do lots of weird things, like speculative loads, and spill inst
// special spill instruction to handle NaT in registers
// loads should be aligned
// "advanced" loads let you move loads in front of stores even
//    if you don't know if they overlap yet.
// use the "chk" instruction to be sure a speculative load worked
// post-increment loads look like   ld8 r2=[r1],1
// Branches.  Branches don't have to be the last insn in a bundle
// all manner of branch prediction hints, plus special loop unrolling stuff
// branch registers can hold the destination of a call
// you specify branch predictor info with branches.
// .spnt (statically predict not taken)  .sptk (taken)
// .dpnt (dynamically predict not taken) .dptk (taken)
// .few  - prefetch only a few instrs at destination
// .many - prefetch many instrs at destination
// String instructions:
// czx finds first zero byte in a reg
// loop instruction: special LC loop count reg and br.cloop
// register stack
// use "alloc" instruction to allocate register stack
// use like "alloc loc0=ar.pfs,w,x,y,z"
// the old ar.pfs (previous frame state) is stored to loc0 (restore on exit)
// w = input regs (which are same as old out regs)
// x = local regs
// y = output regs (up to 8 for passing params)
// z = rotating registers
// allocated registers are placed as r32-r??
// syscall is the "break.i 0x100000" instruction, with syscall # in r15
// and inputs in out0...out?
.equ STDIN,0
.equ STDOUT,1
.equ STDERR,2

// offsets into the results returned by the uname syscall
.equ U_SYSNAME,0
.equ U_NODENAME,65
.equ U_RELEASE,65*2
.equ U_VERSION,(65*3)
.equ U_MACHINE,(65*4)
.equ U_DOMAINNAME,65*5

// offset of the total-RAM field in the results of the sysinfo syscall
.equ S_TOTALRAM,32

// syscall numbers
.equ SYSCALL_EXIT,1025
.equ SYSCALL_READ,1026
.equ SYSCALL_WRITE,1027
.equ SYSCALL_CLOSE,1029
.equ SYSCALL_OPEN,1028
.equ SYSCALL_SYSINFO,1127
.equ SYSCALL_UNAME,1130

// buffer sizes and layout constants
.equ SYSINFO_SIZE,128		// struct sysinfo is 112 bytes on 64-bit Linux
.equ ASCII_BUFFER_SIZE,24	// up to 20 decimal digits plus a NUL
.equ DISK_BUFFER_SIZE,4096	// room for /proc/cpuinfo
.equ LINE_WIDTH,80		// size of screen+1, used for centering

//=====================================================================
// Register conventions used throughout this file
//   r13 = start of out_buffer (never changes after _start)
//   r4  = current append position in out_buffer (advanced by strcat,
//         find_string and num_to_ascii in strcat mode)
//   r15 = syscall number for "break.i 0x100000"
// Every helper returns through its own branch register, so nested
// helpers never overwrite each other's return address:
//   b0 center_and_print   b1 find_string/count_string/num_to_ascii
//   b3 divide             b4 write_stdout             b5 strcat
//=====================================================================

	.globl _start
_start:
	alloc	loc0 = ar.pfs,4,32,6,0	// 4 inputs, 32 locals, 6 outputs

	movl	r13=out_buffer		// point output to buffer

	//=========================
	// PRINT LOGO
	//=========================

	// LZSS decompression algorithm implementation
	// by Stephan Walter 2002, based on LZSS.C by Haruhiko Okumura 1989
	// optimized some more by Vince Weaver
	//
	// loc1  = output pointer          loc2 = R, write index in text_buf
	// loc3  = compressed input ptr    loc8 = end of compressed input
	// loc9  = text_buf base           loc5 = flag byte with 0xff00 on top
	// loc6  = bytes left to copy      loc7 = read index in text_buf
	// loc4, loc12, loc13 = scratch

	mov	loc1=r13		// buffer we are printing to
	mov	loc2=(N-F)		// R
	movl	loc3=logo		// beginning of logo data
	movl	loc8=logo_end		// end of logo data
	movl	loc9=text_buf		// point to text_buf

decompression_loop:
	ld1	loc4=[loc3],1		// load flag byte, auto-increment
	mov	loc5=0xff00
	or	loc5=loc5,loc4		// set top 8 bits as a pseudo-count

test_flags:
	cmp.ge	p2,p3=loc3,loc8		// have we reached the end?
(p2)	br.cond.dpnt done_logo		// if so, branch to end

	tbit.nz	p4,p5=loc5,0		// look at the low flag bit...
	shr.u	loc5=loc5,1		// ...and shift it out
(p4)	br.cond.dptk discrete_char	// if shifted out a 1, discrete char

offset_length:
	ld1	loc12=[loc3],1		// load low byte, inc
	ld1	loc4=[loc3],1		// load high byte, inc
					// we have to merge into one 16-bit
					// value, as we have no unaligned loads
	shl	loc13=loc4,8		// shift high byte left
	or	loc4=loc13,loc12	// and combine with the low byte
	mov	loc7=loc4		// copy into loc7. we mask this later
	mov	loc12=(THRESHOLD+1)
	shr	loc13=loc4,P_BITS
	add	loc6=loc12,loc13	// loc6 = (loc4>>P_BITS)+THRESHOLD+1
					//      = match_length

output_loop:
	mov	loc12=((POSITION_MASK<<8)+0xff)
	and	loc7=loc7,loc12		// mask read index
	add	loc12=loc7,loc9
	ld1	loc4=[loc12]		// load byte from text_buf[loc7]
	add	loc7=1,loc7		// increment read index

store_byte:
	st1	[loc1]=loc4,1		// store byte to output and auto-inc
	add	loc12=loc9,loc2
	st1	[loc12]=loc4		// store byte to text_buf[loc2]
	add	loc2=1,loc2		// increment loc2
	mov	loc12=(N-1)
	and	loc2=loc2,loc12		// mask loc2 to not be too big

	add	loc6=-1,loc6		// decrement count
	cmp.ne	p2,p3=0,loc6
(p2)	br.cond.dptk output_loop	// if not zero, loop

	// loc12 is used as scratch here; r12 is the stack pointer and
	// must not be used as a temporary.
	mov	loc12=0x100
	cmp.lt	p4,p5=loc12,loc5	// are pseudo-count bits still set?
(p4)	br.cond.dptk test_flags		// if so, more flag bits remain

	br	decompression_loop	// otherwise load a new flag byte

discrete_char:
	ld1	loc4=[loc3],1		// load byte and auto-inc
	mov	loc6=1			// set to store single byte
	br	store_byte		// store it

done_logo:
	mov	out0=r13
	br.call.sptk.few b4=write_stdout

	//==========================
	// PRINT VERSION
	//==========================
first_line:
	mov	r4=r13			// append from start of out_buffer

	mov	r15=SYSCALL_UNAME	// uname syscall
	movl	out0=uname_info		// uname struct
	mov	loc4=out0		// keep pointer across the syscall
	break.i	0x100000		// call syscall

	add	out0=U_SYSNAME,loc4	// os-name from uname "Linux"
	br.call.sptk.few b5=strcat

	movl	out0=ver_string		// source is " Version "
	br.call.sptk.few b5=strcat

	add	out0=U_RELEASE,loc4	// version from uname "2.4.1"
	br.call.sptk.few b5=strcat

	movl	out0=compiled_string	// source is ", Compiled "
	br.call.sptk.few b5=strcat

	add	out0=U_VERSION,loc4	// compiled date
	br.call.sptk.few b5=strcat

	movl	out0=linefeed
	br.call.sptk.few b5=strcat	// "\n"

	{	// force not to be in a bundle with another call
	br.call.sptk.few b0=center_and_print
	}

	//===============================
	// Middle-Line
	//===============================
middle_line:
	mov	r4=r13			// restore output pointer

	//=========
	// Load /proc/cpuinfo into buffer
	//=========

	mov	r15=SYSCALL_OPEN	// open()
	movl	out0=cpuinfo		// '/proc/cpuinfo'
	mov	out1=0			// O_RDONLY
	break.i	0x100000		// syscall.  fd returned in r8

	mov	r15=SYSCALL_READ	// read
	mov	loc5=r8			// keep fd in a local: output
					// registers are not relied on to
					// survive a syscall
	mov	out0=r8			// copy fd
	movl	out1=disk_buffer
	mov	out2=(DISK_BUFFER_SIZE-1) // leave the last (zero) byte of
					// the buffer untouched so the
					// scanners always find a NUL
	break.i	0x100000		// syscall

	mov	r15=SYSCALL_CLOSE
	mov	out0=loc5		// fd saved above
	break.i	0x100000		// syscall

	//=============
	// Number of CPUs
	//=============

	mov	out0='I'		// count how many times IPS appears
	mov	out1='P'
	mov	out2='S'
	br.call.sptk b1=count_string

	cmp.eq	p4,p5=0,r8		// nothing found?  treat as one cpu
(p4)	mov	r8=1			// so we never index array[-1]
	cmp.lt	p2,p3=5,r8		// cap cpus at "5" = many
(p2)	mov	r8=5

	movl	loc2=array		// table of offsets into the names
	movl	loc3=one		// first name string
	add	loc4=loc2,r8		// add offset in array
	add	loc4=-1,loc4		// array is indexed from cpu count 1
	ld1	loc4=[loc4]
	add	out0=loc4,loc3
	br.call.sptk.few b5=strcat

	//=========
	// MHz
	//=========

	mov	out0='M'		// find 'MHz' and grab after ':'
	mov	out1='H'
	mov	out2='z'
	mov	out3='.'		// stop at the decimal point
	br.call.sptk.many b1=find_string

	movl	out0=megahertz		// print 'MHz '
	br.call.sptk.many b5=strcat

	//=========
	// Chip Name
	//=========

	mov	out0='i'		// find 'ily' and grab after ':'
	mov	out1='l'
	mov	out2='y'
	mov	out3='\n'
	br.call.sptk.many b1=find_string

	movl	out0=ram_comma		// print ', ' (the tail of "M RAM, ")
	add	out0=5,out0
	br.call.sptk.many b5=strcat

	//========
	// RAM
	//========

	mov	r15=SYSCALL_SYSINFO	// sysinfo() syscall
	movl	loc2=sysinfo_buff	// keep pointer in a local
	mov	out0=loc2
	break.i	0x100000

	add	loc2=S_TOTALRAM,loc2
	ld8	loc3=[loc2]		// total RAM field
	shr	out1=loc3,20		// divide by 2^20 to get megabytes
	mov	out0=1			// 1 = append to out_buffer
	br.call.sptk.many b1=num_to_ascii

	movl	out0=ram_comma		// print 'M RAM, '
	br.call.sptk.many b5=strcat

	//========
	// Bogomips
	//========

	mov	out0='I'		// find 'IPS' and grab up to \n
	mov	out1='P'
	mov	out2='S'
	mov	out3='\n'
	br.call.sptk.many b1=find_string

	movl	out0=bogo_total
	br.call.sptk.many b5=strcat

	{
	br.call.sptk.few b0=center_and_print
	}

	//=================================
	// Print Host Name
	//=================================
last_line:
	mov	r4=r13			// out_buffer
	movl	loc1=uname_info		// uname struct
	add	out0=U_NODENAME,loc1	// print node name
	{
	br.call.sptk.few b5=strcat
		// gas incorrectly puts two calls in same bundle
		// so we are forcing this one to be by itself
	}

	br.call.sptk.few b0=center_and_print

	movl	out0=default_colors	// restore default colors
	br.call.sptk.few b4=write_stdout

	//================================
	// Exit
	//================================
exit:
	mov	out0=0			// 0 exit value
	mov	r15=SYSCALL_EXIT	// load syscall number
	break.i	0x100000

	//=================================
	// Divide
	//   yes this is an awful algorithm, but simple
	//   and uses few registers
	//=================================
	// in0 = numerator   in1 = denominator
	// r8  = quotient    r9  = remainder
	// b3  = return
	// Uses f4, f7, f8, p8, p9.  Tries quotients 0,1,2,... until
	// quotient*denominator exceeds the numerator, so it never
	// terminates for a zero denominator.

divide:
	alloc	loc0=ar.pfs,2,8,6,0	// setup register frame
	mov	r8=r0			// zero out result
	setf.sig f4=in1			// itanium uses fp multiply
divide_loop:
	setf.sig f8=r8
	xma.l	f7=f8,f4,f0		// multiply trial quotient by denominator
	add	r8=1,r8			// increment quotient
	getf.sig loc1=f7		// get result
	cmp.le	p8,p9=loc1,in0		// is product still <= numerator?
(p8)	br.cond.dptk divide_loop	// if so, try the next quotient
	add	r8=-2,r8		// went too far (and pre-incremented),
					// so back up by two and done
	setf.sig f8=r8
	xma.l	f7=f8,f4,f0		// calculate remainder
	getf.sig loc1=f7
	sub	r9=in0,loc1		// R=N-(Q*D)

	mov	ar.pfs=loc0		// restore register file
	br.ret.sptk.many b3		// return

	//=================================
	// FIND_STRING
	//=================================
	// in0,in1,in2 are 3-char ascii string to look for
	// in3 is char to end at
	// r4  = destination, advanced past each byte stored
	// b1  = return
	// Scans disk_buffer for the 3-char string, skips to the next
	// ':' plus one more character, then copies bytes to [r4] until
	// in3 is seen (a NUL is then written at r4 without advancing)
	// or a NUL is read.  Returns without storing anything if the
	// string or the colon is not found.

find_string:
	alloc	loc0=ar.pfs,4,8,4,0
	movl	loc1=disk_buffer	// look in cpuinfo buffer
find_loop:
	ld1	loc2=[loc1],1		// watch for first char
	cmp.eq	p8,p9=0,loc2		// see if off the end
(p8)	br.cond.spnt done		// if so, exit (same check as in
					// count_string; without it a missing
					// string runs past the buffer)
	cmp.ne	p6,p7=loc2,in0
(p6)	br.cond.dptk find_loop
	ld1	loc2=[loc1]		// watch for second char
	cmp.ne	p6,p7=loc2,in1
(p6)	br.cond.dptk find_loop
	add	loc3=1,loc1
	ld1	loc2=[loc3]		// watch for third char
	cmp.ne	p6,p7=loc2,in2
(p6)	br.cond.dptk find_loop

					// if we get this far, we matched
find_colon:
	ld1	loc2=[loc1],1		// repeat till we find colon
	cmp.eq	p8,p9=0,loc2		// escape if hit a null
(p8)	br.cond.dptk done
	cmp.ne	p10,p11=':',loc2
(p10)	br.cond.dptk find_colon

	add	loc1=1,loc1		// skip a char [should be space]
store_loop:
	ld1	loc2=[loc1],1
	cmp.eq	p6,p7=0,loc2		// is it a null?
(p6)	br.cond.dptk done
	cmp.eq	p8,p9=in3,loc2		// is it end char?
(p8)	br.cond.dptk almost_done	// if so, finish
	st1	[r4]=loc2,1		// if not store and continue
	br	store_loop
almost_done:
	st1	[r4]=r0			// terminate the string with a null
done:
	mov	ar.pfs=loc0
	br.ret.sptk.many b1

	//=================================
	// COUNT_STRING
	//=================================
	// in0,in1,in2 are 3-char ascii string to look for
	// r8 = number of occurrences found in disk_buffer
	// b1 = return

count_string:
	alloc	loc0=ar.pfs,4,8,4,0
	mov	r8=0
	movl	loc1=disk_buffer	// look in cpuinfo buffer
count_loop:
	ld1	loc2=[loc1],1		// watch for first char
	cmp.eq	p8,p9=0,loc2		// see if off the end
(p8)	br.cond.spnt done_counting	// if so, exit
	cmp.ne	p6,p7=loc2,in0
(p6)	br.cond.dptk count_loop
	ld1	loc2=[loc1]		// watch for second char
	cmp.ne	p6,p7=loc2,in1
(p6)	br.cond.dptk count_loop
	add	loc3=1,loc1
	ld1	loc2=[loc3]		// watch for third char
	cmp.ne	p6,p7=loc2,in2
(p6)	br.cond.dptk count_loop
					// if we get this far, we matched
	add	r8=1,r8
	br	count_loop
done_counting:
	mov	ar.pfs=loc0
	br.ret.sptk.many b1

	//================================
	// write_stdout
	//================================
	// in0 = NUL-terminated string to output (in0 is advanced while
	//       the length is counted)
	// b4  = return
	// Uses p1, p2, r15 and issues the write syscall.

write_stdout:
	alloc	loc0=ar.pfs,1,2,6,0	// setup reg stack
	mov	out1=in0		// copy string to print
	mov	out2=0			// set length to 0
string_loop:
	ld1	loc1=[in0],1		// load byte, auto inc
	cmp.ne	p1,p2=loc1,r0		// is it zero?
	add	out2=1,out2		// increment count
(p1)	br.cond.dptk string_loop	// if not zero, loop
	add	out2=-1,out2		// fix count to not include 0

	mov	out0=STDOUT		// stdout
	mov	r15=SYSCALL_WRITE	// write_syscall
	break.i	0x100000		// syscall

	mov	ar.pfs=loc0		// restore register stack
	br.ret.sptk.many b4		// return

	//================================
	// strcat
	//================================
	// in0 = "source" (NUL-terminated; in0 is advanced)
	// r4  = "destination"; on return points at the copied NUL
	// b5  = return pointer

strcat:
	alloc	loc0=ar.pfs,1,8,6,0	// setup register frame
strcat_loop:
	ld1	loc1=[in0],1		// load and increment pointer
	st1	[r4]=loc1,1		// store and increment pointer
	cmp.ne	p6,p7=0,loc1
(p6)	br.cond.dptk strcat_loop	// is it zero?  if not, loop
	add	r4=-1,r4		// back up pointer to the zero

	mov	ar.pfs=loc0		// restore register frame
	br.ret.sptk.many b5		// return

	//==============================
	// center_and_print
	//==============================
	// out_buffer (r13) has string to print, r4 points at its end
	// return address in b0
	// Prints ESC [ <n> C to move the cursor right by half of the
	// unused width, then the string.  A string wider than
	// LINE_WIDTH is printed as is, with no cursor movement.

center_and_print:
	alloc	loc0=ar.pfs,4,8,6,0

	sub	loc2=r4,r13		// find size of string by subtracting
	mov	loc3=LINE_WIDTH		// size of screen+1
	sub	loc2=loc3,loc2		// loc2 = unused columns
	cmp.lt	p6,p7=loc2,r0
(p6)	br.cond.dpnt print_line		// if negative, don't center

	movl	out0=escape		// print escape
	br.call.sptk.few b4=write_stdout

	shr	out1=loc2,1		// divide by two
	mov	out0=0			// 0 = print to stdout
	br.call.sptk.few b1=num_to_ascii

	movl	out0=C			// print trailing C
	br.call.sptk.few b4=write_stdout

print_line:
	mov	out0=r13		// out_buffer
	br.call.sptk.few b4=write_stdout

done_center:
	mov	ar.pfs=loc0
	br.ret.sptk.many b0

	//===========================
	// num_to_ascii
	//===========================
	// in0 = 0: print to stdout   otherwise: strcat onto [r4]
	// in1 = number
	// b1  = return
	// Digits are produced least-significant first and stored
	// backwards from the end of ascii_buffer, which is terminated
	// with a NUL.

num_to_ascii:
	alloc	loc0=ar.pfs,2,8,6,0
	movl	loc1=ascii_buffer	// point to buffer
	add	loc1=(ASCII_BUFFER_SIZE-1),loc1	// point to last byte
	st1	[loc1]=r0,-1		// terminating NUL; loc1 now points
					// at the slot for the last digit
	mov	out1=10			// divide by 10
	mov	out0=in1		// move in number to divide
num_loop:
	br.call.sptk.few b3=divide
after_div:
	add	r9=0x30,r9		// convert remainder to ascii
	st1	[loc1]=r9,-1		// store digit, move toward the front
	mov	out0=r8			// move quotient back in
	cmp.ne	p6,p7=0,r8		// see if done
(p6)	br.cond.dptk num_loop

	add	loc1=1,loc1		// fix pointer: back onto first digit

	cmp.eq	p6,p7=0,in0
	mov	out0=loc1
	// Exactly one of the two calls below is taken.  They are kept
	// as a predicated pair (not call / branch-around) so that no
	// unpredicated branch directly follows a call in a bundle.
(p6)	br.call.sptk b4=write_stdout
(p7)	br.call.sptk b5=strcat

num_all_done:
	mov	ar.pfs=loc0		// restore reg stack
	br.ret.sptk b1			// return

//===========================================================================
//.data
//===========================================================================

ver_string:		.asciz " Version "
compiled_string:	.asciz ", Compiled "
linefeed:		.asciz "\n"
cpuinfo:		.asciz "/proc/cpuinfo"
megahertz:		.asciz "MHz "
processor:		.asciz " Processor, "
ram_comma:		.asciz "M RAM, "
bogo_total:		.asciz " Bogomips Total\n"

default_colors:		.asciz "\033[0m\n\n"
escape:			.asciz "\033["
C:			.asciz "C"

// array[n-1] is the byte offset from "one" of the name for n cpus
array:	.byte 0,5,10,17,23
one:	.asciz "One "
two:	.asciz "Two "
three:	.asciz "Three "
four:	.asciz "Four "
many:	.asciz "Many "

.include "logo.lzss_new"

//============================================================================
//.bss
//============================================================================

// .lcomm arguments: name, size, alignment

bss_begin:
.lcomm uname_info, (65*6),4
	// struct sysinfo; see /usr/src/linux/include/linux/kernel.h
	// 8-byte aligned because the total-RAM field is read with ld8
.lcomm sysinfo_buff, SYSINFO_SIZE,8
.lcomm ascii_buffer, ASCII_BUFFER_SIZE,4
.lcomm text_buf, (N+F-1),4
.lcomm disk_buffer, DISK_BUFFER_SIZE,4	// we cheat!!!!
.lcomm out_buffer, 16384,4