From 376729e1301f82a8f20ce78f36b7107c75720a7c Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 21 Nov 2004 10:36:25 +0000 Subject: RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's apparently impossible to compose blended code which would perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and the P4 core is detected at run-time. This way we keep original performance on non-P4 implementations and turbo-charge P4 performance by a factor of 2.8x (on 32-bit core). --- crypto/perlasm/x86unix.pl | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'crypto/perlasm') diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 7d87eb1701..867fa09e48 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); } sub main'shr { &out2("shrl",@_); } sub main'xor { &out2("xorl",@_); } sub main'xorb { &out2("xorb",@_); } -sub main'add { &out2("addl",@_); } +sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); } sub main'adc { &out2("adcl",@_); } sub main'sub { &out2("subl",@_); } sub main'sbb { &out2("sbbl",@_); } @@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); } sub main'jnc { &out1("jnc",@_); } sub main'jno { &out1("jno",@_); } sub main'dec { &out1("decl",@_); } -sub main'inc { &out1("incl",@_); } +sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); } sub main'push { &out1("pushl",@_); $stack+=4; } sub main'pop { &out1("popl",@_); $stack-=4; } sub main'pushf { &out0("pushfl"); $stack+=4; } @@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); } sub main'test { &out2("testl",@_); } sub main'bt { &out2("btl",@_); } sub main'leave { &out0("leave"); } -sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); } -sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); } +sub main'cpuid { &out0(".byte\t0x0f,0xa2"); } +sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } +sub main'movz { &out2("movzb",@_); } 
# SSE2 sub main'emms { &out0("emms"); } @@ -558,7 +559,7 @@ sub main'file_end pushl %ebx movl %edx,%edi movl \$1,%eax - .byte 0x0f; .byte 0xa2 + .byte 0x0f,0xa2 orl \$1<<10,%edx movl %edx,0(%edi) popl %ebx -- cgit v1.2.3