[parisc-linux-cvs] Re: DIFF 2.6.0-test4-pa7
Grant Grundler
grundler@parisc-linux.org
Sun, 31 Aug 2003 23:49:50 -0600
On Sun, Aug 31, 2003 at 06:30:42PM -0600, Grant Grundler wrote:
> Log message:
> 2.4.0-test4-pa7 Fast __ffs and cleanup atomic.h
>
> o commit final version of lamont's fast __ffs(). willy/jejb get credit
> for making sure I got semantics correct. I added 64-bit support.
> o remove dead code from atomic.h (#if 1/#else/#endif)
DIFF attached.
BTW, lamont's code is really quite clever. It basically unrolls the loop
and completely avoids branches - something that's very expensive
if mispredicted. More comments in the diff.
grant
Index: Makefile
===================================================================
RCS file: /var/cvs/linux-2.6/Makefile,v
retrieving revision 1.26
diff -u -p -r1.26 Makefile
--- Makefile 31 Aug 2003 21:08:50 -0000 1.26
+++ Makefile 1 Sep 2003 00:27:56 -0000
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 0
-EXTRAVERSION = -test4-pa6
+EXTRAVERSION = -test4-pa7
# *DOCUMENTATION*
# To see a list of typical targets execute "make help"
Index: include/asm-parisc/atomic.h
===================================================================
RCS file: /var/cvs/linux-2.6/include/asm-parisc/atomic.h,v
retrieving revision 1.1
diff -u -p -r1.1 atomic.h
--- include/asm-parisc/atomic.h 29 Jul 2003 17:02:03 -0000 1.1
+++ include/asm-parisc/atomic.h 1 Sep 2003 00:27:56 -0000
@@ -49,27 +49,14 @@ extern spinlock_t __atomic_hash[ATOMIC_H
* Cache-line alignment would conflict with, for example, linux/module.h
*/
-typedef struct {
- volatile int counter;
-} atomic_t;
+typedef struct { volatile long counter; } atomic_t;
-/*
-** xchg/cmpxchg moved from asm/system.h - ggg
-*/
-
-#if 1
/* This should get optimized out since it's never called.
** Or get a link error if xchg is used "wrong".
*/
extern void __xchg_called_with_bad_pointer(void);
-#else
-static inline void __xchg_called_with_bad_pointer(void)
-{
- extern void panic(const char * fmt, ...);
- panic("xchg called with bad pointer");
-}
-#endif
+
/* __xchg32/64 defined in arch/parisc/lib/bitops.c */
extern unsigned long __xchg8(char, char *);
Index: include/asm-parisc/bitops.h
===================================================================
RCS file: /var/cvs/linux-2.6/include/asm-parisc/bitops.h,v
retrieving revision 1.1
diff -u -p -r1.1 bitops.h
--- include/asm-parisc/bitops.h 29 Jul 2003 17:02:03 -0000 1.1
+++ include/asm-parisc/bitops.h 1 Sep 2003 00:27:56 -0000
@@ -219,32 +219,64 @@ extern __inline__ unsigned long ffz(unsi
#ifdef __KERNEL__
/**
- * __ffs - find first bit in word.
+ * __ffs - find first bit in word. returns 0 to "BITS_PER_LONG-1".
* @word: The word to search
*
- * Undefined if no bit exists, so code should check against 0 first.
+ * __ffs() return is undefined if no bit is set.
+ *
+ * 32-bit fast __ffs by LaMont Jones "lamont At hp com".
+ * 64-bit enhancement by Grant Grundler "grundler At parisc-linux org".
+ * (with help from willy/jejb to get the semantics right)
+ *
+ * This algorithm avoids branches by making use of nullification.
+ * One side effect of "extr" instructions is it sets PSW[N] bit.
+ * How PSW[N] (nullify next insn) gets set is determined by the
+ * "condition" field (eg "<>" or "TR" below) in the extr* insn.
+ * Only the 1st and one of either the 2cd or 3rd insn will get executed.
+ * Each set of 3 insn will get executed in 2 cycles on PA8x00 vs 16 or so
+ * cycles for each mispredicted branch.
*/
-static __inline__ unsigned long __ffs(unsigned long word)
+
+static __inline__ unsigned long __ffs(unsigned long x)
{
- unsigned long result = 0;
+ unsigned long ret;
- while (!(word & 1UL)) {
- result++;
- word >>= 1;
- }
- return result;
+ __asm__(
+#if BITS_PER_LONG > 32
+ " ldi 63,%1\n"
+ " extrd,u,*<> %0,63,32,%%r0\n"
+ " extrd,u,*TR %0,31,32,%0\n"
+ " addi -32,%1,%1\n"
+#else
+ " ldi 31,%1\n"
+#endif
+ " extru,<> %0,31,16,%%r0\n"
+ " extru,TR %0,15,16,%0\n"
+ " addi -16,%1,%1\n"
+ " extru,<> %0,31,8,%%r0\n"
+ " extru,TR %0,23,8,%0\n"
+ " addi -8,%1,%1\n"
+ " extru,<> %0,31,4,%%r0\n"
+ " extru,TR %0,27,4,%0\n"
+ " addi -4,%1,%1\n"
+ " extru,<> %0,31,2,%%r0\n"
+ " extru,TR %0,29,2,%0\n"
+ " addi -2,%1,%1\n"
+ " extru,= %0,31,1,%%r0\n"
+ " addi -1,%1,%1\n"
+ : "+r" (x), "=r" (ret) );
+ return ret;
}
/*
- * ffs: find first bit set. This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
+ * ffs: find first bit set. returns 1 to BITS_PER_LONG or 0 (if none set)
+ * This is defined the same way as the libc and compiler builtin
+ * ffs routines, therefore differs in spirit from the above ffz (man ffs).
*/
static __inline__ int ffs(int x)
{
- if (!x)
- return 0;
- return __ffs((unsigned long)x);
+ if (!x) return 0;
+ return __ffs((unsigned long)x) + 1;
}
/*