From a6bd72a27873294887681d3bd102d848e5777e2c Mon Sep 17 00:00:00 2001
From: Jeff Johnston
Date: Mon, 26 May 2008 23:23:15 +0000
Subject: 2008-05-26  Eric Blake

        Optimize the generic and x86 memset.
        * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.
        * libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
        Pre-align pointer so unaligned stores aren't penalized.  Prefer
        8-byte over 4-byte alignment.  Reduce register pressure.
---
 newlib/ChangeLog                  |  9 ++++++
 newlib/libc/machine/i386/memset.S | 68 +++++++++++++++++++++++++++++++--------
 newlib/libc/string/memset.c       | 51 +++++++++++++----------------
 3 files changed, 85 insertions(+), 43 deletions(-)

diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index 74fe2fd4d..02670c59c 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,3 +1,12 @@
+2008-05-26  Eric Blake
+
+	Optimize the generic and x86 memset.
+	* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
+	Pre-align pointer so unaligned stores aren't penalized.
+	* libc/machine/i386/memset.S (memset) [!__OPTIMIZE_SIZE__]:
+	Pre-align pointer so unaligned stores aren't penalized.  Prefer
+	8-byte over 4-byte alignment.  Reduce register pressure.
+
 2008-05-26  Eric Blake
 
 	Optimize the generic and x86 strlen.

diff --git a/newlib/libc/machine/i386/memset.S b/newlib/libc/machine/i386/memset.S
index ce40820ff..36637fc21 100644
--- a/newlib/libc/machine/i386/memset.S
+++ b/newlib/libc/machine/i386/memset.S
@@ -1,6 +1,6 @@
 /*
  * ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this
  * software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
 	pushl ebp
 	movl esp,ebp
 	pushl edi
-	pushl ebx
 	movl 8(ebp),edi
 	movl 12(ebp),eax
 	movl 16(ebp),ecx
 	cld
 
 #ifndef __OPTIMIZE_SIZE__
-	andl $255,eax
-	movl ecx,ebx
-	testl $3,edi
-	jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop.  */
 	cmpl $16,ecx
 	jbe .L19
+	cbw
+	testl $7,edi
+	je .L10
 
-	movl eax,edx
-	sall $8,eax
-	orl edx,eax
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+   4-byte aligned on some x86 platforms.  */
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+	testl $7,edi
+	je .L10
+
+	movb al,(edi)
+	incl edi
+	decl ecx
+
+/* At this point, ecx>8 and edi%8==0.  */
+.L10:
+	movb al,ah
 	movl eax,edx
 	sall $16,edx
 	orl edx,eax
+	movl ecx,edx
 	shrl $2,ecx
-	andl $3,ebx
 	rep
 	stosl
-	movl ebx,ecx
+	movl edx,ecx
 #endif /* not __OPTIMIZE_SIZE__ */
-
+
 .L19:
 	rep
 	stosb
 	movl 8(ebp),eax
-	leal -8(ebp),esp
-	popl ebx
+	leal -4(ebp),esp
 	popl edi
 	leave
 	ret

diff --git a/newlib/libc/string/memset.c b/newlib/libc/string/memset.c
index ac3590ea4..8dbb5f85d 100644
--- a/newlib/libc/string/memset.c
+++ b/newlib/libc/string/memset.c
@@ -22,7 +22,7 @@ DESCRIPTION
 	pointed to by <[dst]> to the value.
 
 RETURNS
-	<<memset>> returns the value of <[m]>.
+	<<memset>> returns the value of <[dst]>.
 
 PORTABILITY
 <<memset>> is ANSI C.
@@ -39,48 +39,42 @@ QUICKREF
 #define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
 #define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
 
-_PTR 
+_PTR
 _DEFUN (memset, (m, c, n),
 	_PTR m _AND
 	int c _AND
 	size_t n)
 {
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
   char *s = (char *) m;
 
-  while (n-- != 0)
-    {
-      *s++ = (char) c;
-    }
-
-  return m;
-#else
-  char *s = (char *) m;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
   int i;
   unsigned long buffer;
   unsigned long *aligned_addr;
   unsigned int d = c & 0xff;	/* To avoid sign extension, copy C to an
				   unsigned variable.  */
 
-  if (!TOO_SMALL (n) && !UNALIGNED (m))
+  while (UNALIGNED (s))
     {
-      /* If we get this far, we know that n is large and m is word-aligned. */
-      aligned_addr = (unsigned long*)m;
+      if (n--)
+        *s++ = (char) c;
+      else
+        return m;
+    }
+
+  if (!TOO_SMALL (n))
+    {
+      /* If we get this far, we know that n is large and s is word-aligned. */
+      aligned_addr = (unsigned long *) s;
 
       /* Store D into each char sized location in BUFFER so that
         we can set large blocks quickly.  */
-      if (LBLOCKSIZE == 4)
-	{
-	  buffer = (d << 8) | d;
-	  buffer |= (buffer << 16);
-	}
-      else
-	{
-	  buffer = 0;
-	  for (i = 0; i < LBLOCKSIZE; i++)
-	    buffer = (buffer << 8) | d;
-	}
+      buffer = (d << 8) | d;
+      buffer |= (buffer << 16);
+      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+        buffer = (buffer << i) | buffer;
 
+      /* Unroll the loop.  */
       while (n >= LBLOCKSIZE*4)
         {
           *aligned_addr++ = buffer;
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
       s = (char*)aligned_addr;
     }
 
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
   while (n--)
-    {
-      *s++ = (char)d;
-    }
+    *s++ = (char) c;
 
   return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
 }
--
cgit v1.2.3
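
For readers who want to try the generic-C half of this change in isolation, the
standalone sketch below reproduces its shape: pre-align the destination byte by
byte, replicate the fill byte across an unsigned long, store whole words, then
finish the tail byte-wise.  It is an illustration, not newlib source: the names
fill_bytes, WORDSIZE, and UNALIGNED are invented here, and the word loop is
kept simple rather than unrolled four ways as in the patch.

/* Standalone sketch of the patched generic memset (illustrative names,
   not newlib code).  Build: cc -O2 sketch.c && ./a.out  */
#include <assert.h>
#include <stddef.h>
#include <string.h>

#define WORDSIZE     (sizeof (unsigned long))
#define UNALIGNED(P) ((unsigned long) (P) & (WORDSIZE - 1))

static void *
fill_bytes (void *m, int c, size_t n)
{
  char *s = m;
  unsigned int d = c & 0xff;	/* avoid sign extension, as the patch does */
  unsigned long buffer;
  unsigned long *aligned_addr;
  unsigned int i;

  /* Pre-align: burn off leading bytes until s sits on a word boundary,
     so the word stores below are never misaligned.  */
  while (UNALIGNED (s))
    {
      if (n--)
	*s++ = (char) c;
      else
	return m;
    }

  if (n >= WORDSIZE)
    {
      /* Replicate d into every byte of buffer.  Two shift-or steps fill
	 32 bits; the loop doubles again only when long is wider, so the
	 same source serves 32- and 64-bit targets.  */
      buffer = (d << 8) | d;
      buffer |= buffer << 16;
      for (i = 32; i < WORDSIZE * 8; i <<= 1)
	buffer |= buffer << i;

      aligned_addr = (unsigned long *) s;
      while (n >= WORDSIZE)	/* the real patch also unrolls this 4x */
	{
	  *aligned_addr++ = buffer;
	  n -= WORDSIZE;
	}
      s = (char *) aligned_addr;
    }

  while (n--)			/* tail: at most WORDSIZE-1 bytes left */
    *s++ = (char) c;
  return m;
}

int
main (void)
{
  char ref[64], buf[64];
  size_t off;

  /* Check every starting misalignment against the library memset.  */
  for (off = 0; off < WORDSIZE; off++)
    {
      memset (ref, 0, sizeof ref);
      memset (buf, 0, sizeof buf);
      memset (ref + off, 0xa5, 40);
      fill_bytes (buf + off, 0xa5, 40);
      assert (memcmp (ref, buf, sizeof buf) == 0);
    }
  return 0;
}

Note that when unsigned long is 32 bits the doubling loop body never executes,
so the variable shift by 32 is never evaluated; the patch's for-loop in
newlib's memset.c relies on the same guard.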