[parisc-linux-cvs] Re: DIFF 2.6.0-test4-pa7

Grant Grundler grundler@parisc-linux.org
Sun, 31 Aug 2003 23:49:50 -0600


On Sun, Aug 31, 2003 at 06:30:42PM -0600, Grant Grundler wrote:
> Log message:
> 2.4.0-test4-pa7 Fast __ffs and cleanup atomic.h
> 
> o commit final version of lamont's fast __ffs(). willy/jejb get credit
> for making sure I got semantics correct. I added 64-bit support.
> o remove dead code from atomic.h (#if 1/#else/#endif)

DIFF attached.

BTW, lamont's code is really quite clever. It basically unrolls the loop
and completely avoids branches - something that's very expensive
if mispredicted. More comments in the diff.

grant


Index: Makefile
===================================================================
RCS file: /var/cvs/linux-2.6/Makefile,v
retrieving revision 1.26
diff -u -p -r1.26 Makefile
--- Makefile	31 Aug 2003 21:08:50 -0000	1.26
+++ Makefile	1 Sep 2003 00:27:56 -0000
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 0
-EXTRAVERSION = -test4-pa6
+EXTRAVERSION = -test4-pa7
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
Index: include/asm-parisc/atomic.h
===================================================================
RCS file: /var/cvs/linux-2.6/include/asm-parisc/atomic.h,v
retrieving revision 1.1
diff -u -p -r1.1 atomic.h
--- include/asm-parisc/atomic.h	29 Jul 2003 17:02:03 -0000	1.1
+++ include/asm-parisc/atomic.h	1 Sep 2003 00:27:56 -0000
@@ -49,27 +49,14 @@ extern spinlock_t __atomic_hash[ATOMIC_H
  * Cache-line alignment would conflict with, for example, linux/module.h
  */
 
-typedef struct {
-	volatile int counter;
-} atomic_t;
+typedef struct { volatile long counter; } atomic_t;
 
 
-/*
-** xchg/cmpxchg moved from asm/system.h - ggg
-*/
-
-#if 1
 /* This should get optimized out since it's never called.
 ** Or get a link error if xchg is used "wrong".
 */
 extern void __xchg_called_with_bad_pointer(void);
-#else
-static inline void __xchg_called_with_bad_pointer(void)
-{
-	extern void panic(const char * fmt, ...);
-	panic("xchg called with bad pointer");
-}
-#endif
+
 
 /* __xchg32/64 defined in arch/parisc/lib/bitops.c */
 extern unsigned long __xchg8(char, char *);
Index: include/asm-parisc/bitops.h
===================================================================
RCS file: /var/cvs/linux-2.6/include/asm-parisc/bitops.h,v
retrieving revision 1.1
diff -u -p -r1.1 bitops.h
--- include/asm-parisc/bitops.h	29 Jul 2003 17:02:03 -0000	1.1
+++ include/asm-parisc/bitops.h	1 Sep 2003 00:27:56 -0000
@@ -219,32 +219,64 @@ extern __inline__ unsigned long ffz(unsi
 #ifdef __KERNEL__
 
 /**
- * __ffs - find first bit in word.
+ * __ffs - find first bit in word. returns 0 to "BITS_PER_LONG-1".
  * @word: The word to search
  *
- * Undefined if no bit exists, so code should check against 0 first.
+ * __ffs() return is undefined if no bit is set.
+ *
+ * 32-bit fast __ffs by LaMont Jones "lamont At hp com".
+ * 64-bit enhancement by Grant Grundler "grundler At parisc-linux org".
+ * (with help from willy/jejb to get the semantics right)
+ *
+ * This algorithm avoids branches by making use of nullification.
+ * One side effect of "extr" instructions is it sets PSW[N] bit.
+ * How PSW[N] (nullify next insn) gets set is determined by the 
+ * "condition" field (eg "<>" or "TR" below) in the extr* insn.
+ * Only the 1st and one of either the 2cd or 3rd insn will get executed.
+ * Each set of 3 insn will get executed in 2 cycles on PA8x00 vs 16 or so
+ * cycles for each mispredicted branch.
  */
-static __inline__ unsigned long __ffs(unsigned long word)
+
+static __inline__ unsigned long __ffs(unsigned long x)
 {
-	unsigned long result = 0;
+	unsigned long ret;
 
-	while (!(word & 1UL)) {
-		result++;
-		word >>= 1;
-	}
-	return result;
+	__asm__(
+#if BITS_PER_LONG > 32
+		" ldi       63,%1\n"
+		" extrd,u,*<>  %0,63,32,%%r0\n"
+		" extrd,u,*TR  %0,31,32,%0\n"
+		" addi    -32,%1,%1\n"
+#else
+		" ldi       31,%1\n"
+#endif
+		" extru,<>  %0,31,16,%%r0\n"
+		" extru,TR  %0,15,16,%0\n"
+		" addi    -16,%1,%1\n"
+		" extru,<>  %0,31,8,%%r0\n"
+		" extru,TR  %0,23,8,%0\n"
+		" addi    -8,%1,%1\n"
+		" extru,<>  %0,31,4,%%r0\n"
+		" extru,TR  %0,27,4,%0\n"
+		" addi    -4,%1,%1\n"
+		" extru,<>  %0,31,2,%%r0\n"
+		" extru,TR  %0,29,2,%0\n"
+		" addi    -2,%1,%1\n"
+		" extru,=  %0,31,1,%%r0\n"
+		" addi    -1,%1,%1\n"
+		: "+r" (x), "=r" (ret) );
+	return ret;
 }
 
 /*
- * ffs: find first bit set. This is defined the same way as
- * the libc and compiler builtin ffs routines, therefore
- * differs in spirit from the above ffz (man ffs).
+ * ffs: find first bit set. returns 1 to BITS_PER_LONG or 0 (if none set)
+ * This is defined the same way as the libc and compiler builtin
+ * ffs routines, therefore differs in spirit from the above ffz (man ffs).
  */
 static __inline__ int ffs(int x)
 {
-	if (!x)
-		return 0;
-	return __ffs((unsigned long)x);
+	if (!x) return 0;
+	return __ffs((unsigned long)x) + 1;
 }
 
 /*