author		Jeff Johnston <jjohnstn@redhat.com>	2008-05-26 23:23:15 +0000
committer	Jeff Johnston <jjohnstn@redhat.com>	2008-05-26 23:23:15 +0000
commit		a6bd72a27873294887681d3bd102d848e5777e2c (patch)
tree		4da6a66d14c0993b5445d9bf6c5df596b72c47ed
parent		cae28869c106eb342dd5a1c8242f933efab6f772 (diff)
2008-05-26 Eric Blake <ebb9@byu.net>
Optimize the generic and x86 memset.
* libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]: Pre-align pointer so unaligned stores aren't penalized.
* libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]: Pre-align pointer so unaligned stores aren't penalized. Prefer 8-byte over 4-byte alignment. Reduce register pressure.
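
[Editor's note] A minimal C sketch of the pre-alignment technique the message describes: byte stores until the pointer is word-aligned, then aligned word stores, then a byte tail. The function name fill_aligned, the uintptr_t word type, and the variable names are illustrative only; the patch itself uses unsigned long (LBLOCKSIZE) in the C path and 4-byte 'rep stosl' in the assembly.

#include <stddef.h>
#include <stdint.h>

/* Illustrative sketch, not newlib's code.  */
static void *
fill_aligned (void *dst, int c, size_t n)
{
  unsigned char *s = dst;
  uintptr_t word = (unsigned char) c;

  /* Pre-align the pointer so the word stores below are never unaligned
     (assumes sizeof (uintptr_t) is a power of two).  */
  while (n > 0 && ((uintptr_t) s & (sizeof word - 1)) != 0)
    {
      *s++ = (unsigned char) c;
      n--;
    }

  /* Replicate the fill byte into every byte of the word.  */
  word |= word << 8;
  word |= word << 16;
#if UINTPTR_MAX > 0xffffffffUL
  word |= word << 32;
#endif

  /* Bulk of the buffer: one aligned word per store.  */
  while (n >= sizeof word)
    {
      *(uintptr_t *) (void *) s = word;
      s += sizeof word;
      n -= sizeof word;
    }

  /* Remaining 0..sizeof(word)-1 bytes.  */
  while (n-- > 0)
    *s++ = (unsigned char) c;

  return dst;
}

The point of the pre-alignment loop is that it costs at most sizeof(word)-1 single-byte stores, after which every wide store hits an aligned address.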
-rw-r--r--	newlib/ChangeLog	9
-rw-r--r--	newlib/libc/machine/i386/memset.S	68
-rw-r--r--	newlib/libc/string/memset.c	51
3 files changed, 85 insertions(+), 43 deletions(-)
diff --git a/newlib/ChangeLog b/newlib/ChangeLog
index 74fe2fd4d..02670c59c 100644
--- a/newlib/ChangeLog
+++ b/newlib/ChangeLog
@@ -1,5 +1,14 @@
2008-05-26 Eric Blake <ebb9@byu.net>
+ Optimize the generic and x86 memset.
+ * libc/string/memset.c (memset) [!__OPTIMIZE_SIZE__]:
+ Pre-align pointer so unaligned stores aren't penalized.
+ * libc/machine/i386/memset.S (memset): [!__OPTIMIZE_SIZE__]:
+ Pre-align pointer so unaligned stores aren't penalized. Prefer
+ 8-byte over 4-byte alignment. Reduce register pressure.
+
+2008-05-26 Eric Blake <ebb9@byu.net>
+
Optimize the generic and x86 strlen.
* libc/string/strlen.c (strlen) [!__OPTIMIZE_SIZE__]: Pre-align
data so unaligned searches aren't penalized.
diff --git a/newlib/libc/machine/i386/memset.S b/newlib/libc/machine/i386/memset.S
index ce40820ff..36637fc21 100644
--- a/newlib/libc/machine/i386/memset.S
+++ b/newlib/libc/machine/i386/memset.S
@@ -1,6 +1,6 @@
/*
* ====================================================
- * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved.
+ * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved.
*
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
@@ -18,43 +18,83 @@ SYM (memset):
pushl ebp
movl esp,ebp
pushl edi
- pushl ebx
movl 8(ebp),edi
movl 12(ebp),eax
movl 16(ebp),ecx
cld
#ifndef __OPTIMIZE_SIZE__
- andl $255,eax
- movl ecx,ebx
- testl $3,edi
- jne .L19
+/* Less than 16 bytes won't benefit from the 'rep stosl' loop. */
cmpl $16,ecx
jbe .L19
+ cbw
+ testl $7,edi
+ je .L10
- movl eax,edx
- sall $8,eax
- orl edx,eax
+/* It turns out that 8-byte aligned 'rep stosl' outperforms
+ 4-byte aligned on some x86 platforms. */
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+ testl $7,edi
+ je .L10
+
+ movb al,(edi)
+ incl edi
+ decl ecx
+
+/* At this point, ecx>8 and edi%8==0. */
+.L10:
+ movb al,ah
movl eax,edx
sall $16,edx
orl edx,eax
+ movl ecx,edx
shrl $2,ecx
- andl $3,ebx
+ andl $3,edx
rep
stosl
- movl ebx,ecx
+ movl edx,ecx
#endif /* not __OPTIMIZE_SIZE__ */
-
+
.L19:
rep
stosb
movl 8(ebp),eax
- leal -8(ebp),esp
- popl ebx
+ leal -4(ebp),esp
popl edi
leave
ret
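
[Editor's note] A C-level paraphrase (illustrative, not the shipped assembly) of what the new code around .L10 does once edi%8==0: replicate the fill byte into all four bytes of EAX, then split the count into dword stores plus a 0-3 byte tail. Names here are hypothetical; the C starts from an already-narrowed byte.

#include <stdint.h>

static inline uint32_t
replicate_byte (uint8_t c)
{
  uint32_t w = c;
  w |= w << 8;          /* movb al,ah                               */
  w |= w << 16;         /* movl eax,edx; sall $16,edx; orl edx,eax  */
  return w;             /* e.g. 0x5a -> 0x5a5a5a5a                  */
}

/* Count split done by movl ecx,edx; shrl $2,ecx; andl $3,edx before
   'rep stosl' / 'rep stosb':
     n == 4 * (n >> 2) + (n & 3), e.g. 27 bytes -> 6 dwords + 3 bytes.  */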
diff --git a/newlib/libc/string/memset.c b/newlib/libc/string/memset.c
index ac3590ea4..8dbb5f85d 100644
--- a/newlib/libc/string/memset.c
+++ b/newlib/libc/string/memset.c
@@ -22,7 +22,7 @@ DESCRIPTION
pointed to by <[dst]> to the value.
RETURNS
- <<memset>> returns the value of <[m]>.
+ <<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
@@ -39,48 +39,42 @@ QUICKREF
#define UNALIGNED(X) ((long)X & (LBLOCKSIZE - 1))
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)
-_PTR
+_PTR
_DEFUN (memset, (m, c, n),
_PTR m _AND
int c _AND
size_t n)
{
-#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
char *s = (char *) m;
- while (n-- != 0)
- {
- *s++ = (char) c;
- }
-
- return m;
-#else
- char *s = (char *) m;
+#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
int i;
unsigned long buffer;
unsigned long *aligned_addr;
unsigned int d = c & 0xff; /* To avoid sign extension, copy C to an
unsigned variable. */
- if (!TOO_SMALL (n) && !UNALIGNED (m))
+ while (UNALIGNED (s))
{
- /* If we get this far, we know that n is large and m is word-aligned. */
- aligned_addr = (unsigned long*)m;
+ if (n--)
+ *s++ = (char) c;
+ else
+ return m;
+ }
+
+ if (!TOO_SMALL (n))
+ {
+ /* If we get this far, we know that n is large and s is word-aligned. */
+ aligned_addr = (unsigned long *) s;
/* Store D into each char sized location in BUFFER so that
we can set large blocks quickly. */
- if (LBLOCKSIZE == 4)
- {
- buffer = (d << 8) | d;
- buffer |= (buffer << 16);
- }
- else
- {
- buffer = 0;
- for (i = 0; i < LBLOCKSIZE; i++)
- buffer = (buffer << 8) | d;
- }
+ buffer = (d << 8) | d;
+ buffer |= (buffer << 16);
+ for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
+ buffer = (buffer << i) | buffer;
+ /* Unroll the loop. */
while (n >= LBLOCKSIZE*4)
{
*aligned_addr++ = buffer;
@@ -99,11 +93,10 @@ _DEFUN (memset, (m, c, n),
s = (char*)aligned_addr;
}
+#endif /* not PREFER_SIZE_OVER_SPEED */
+
while (n--)
- {
- *s++ = (char)d;
- }
+ *s++ = (char) c;
return m;
-#endif /* not PREFER_SIZE_OVER_SPEED */
}
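
[Editor's note] A small sanity check, not part of the patch, covering the cases the rewrite must preserve: an unaligned starting pointer, n == 0, and the documentation fix that memset returns dst rather than the fill value.

#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[64];

  memset (buf, 0xaa, sizeof buf);              /* baseline fill */
  assert ((unsigned char) buf[0] == 0xaa && (unsigned char) buf[63] == 0xaa);

  assert (memset (buf + 1, 0, 33) == buf + 1); /* unaligned dst; returns dst */
  assert (buf[0] == (char) 0xaa && buf[1] == 0 && buf[33] == 0);

  memset (buf, 'x', 0);                        /* n == 0 must be a no-op */
  assert ((unsigned char) buf[0] == 0xaa);

  return 0;
}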